我正在写一个爬虫(gevent+requests+redis-py),出现了一些问题,看看各位有啥好的解决方案没?

penkzhou 发布于 2014年06月05日
无人欣赏。

我的爬虫大致思想的是这样的,我想爬取某些列表页面上所有的列表url,有很多页,我遍历这些页面,然后抓去这些页面上的内容,当某个页面请求出错的时候,我就将它保存到一个数据库,下次从这个数据库里面把错误的取出来,然后再处理,这样一直循环,直到所有的都被处理完。不多说了,直接代码吧(更详细的问题描述见代码的注释)代码有点乱了,gist地址:https://gist.github.com/penkzhou/a657720be302f72269ca :

    # _*_ coding: utf-8 _*_
import sys
reload(sys)
sys.setdefaultencoding("utf8")
from gevent import monkey
monkey.patch_all()
import requests
import redis
import gevent
from gevent.pool import Pool
from bs4 import BeautifulSoup
import time
from pymongo import MongoClient, ReadPreference
import json
import redis.connection
redis.connection.socket = gevent.socket
mongo_connection = MongoClient(
    '%s:%d' % (
        JobProjectConfiguration.save_mongo_host,
        JobProjectConfiguration.save_mongo_port),
    read_preference=ReadPreference.SECONDARY,
    max_pool_size=10, use_greenlets=True)

mongo_db = mongo_connection.jobdigg

redis_connection = redis.ConnectionPool(
    host=JobProjectConfiguration.url_queue_redis_host,
    port=JobProjectConfiguration.url_queue_redis_port,
    db=JobProjectConfiguration.url_queue_redis_db
    )

redis_proxy_pool = redis.ConnectionPool(
    host=JobProjectConfiguration.proxy_queue_redis_host,
    port=JobProjectConfiguration.proxy_queue_redis_port,
    db=JobProjectConfiguration.proxy_queue_redis_db
    )


proxy_pool = []
pool_num = 100


header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip,deflate,sdch",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36"
    }


def WYUrlGenerator():
    print '51 Dig start : the url...'
    start = time.time()
    redis_db = redis.Redis(connection_pool=redis_connection)
    urllist = WYJobUrlYield()
    gpool = Pool(pool_num)
    for uargs in urllist:
        gpool.spawn(GenerateUrl, uargs)
    gpool.join()
    # 从这里开始,循环的从错误url集合里面取url,直至取完所有的
    length = redis_db.scard("error_url_list")
    while length > 0:
        errorlist = ErrorUrlGenerator()
        epool = Pool(pool_num)
        for url in errorlist:
            epool.spawn(GenerateUrl, url)
        epool.join()
        length = redis_db.scard("error_url_list")
    end = time.time()
    print 'dig end : the url...all spend time is %0.2f' % (end - start)


def WYJobUrlYield():
    for page in xrange(3000):
        page += 1
        url = "http://some.crawl.url with page num %s" % page
        jobitem = {
            "url": url,
            "type": "jobtype"
            } 
        jobvalue = json.dumps(jobitem)
        yield jobvalue


#从错误url的集合里面取出url 再次处理
def ErrorUrlGenerator():
    redis_db = redis.Redis(connection_pool=redis_connection)
    urllist = redis_db.smembers("error_url_list")
    for url in urllist:
        yield url


def GenerateUrl(sourcejob):
    redis_db = redis.StrictRedis(connection_pool=redis_connection)
    pipe = redis_db.pipeline()
    newitem = json.loads(sourcejob)
    url = newitem["url"]
    urltype = newitem["type"]
    try:
        ip = proxy_pool.getProxy()
        proxy = {"http": "http://"+ip["proxy"]}
        timeout = gevent.Timeout(5, ConnectionError)#在这里设置超时,保证每次请求不会太长,阻塞后面的请求,超过5秒抛出错误
        timeout.start()
        r = requests.get(url, headers=header, proxies=proxy)
        jobs = BeautifulSoup(r.text)
        if urltype == "urltype":  #获取页面的所有url,然后保存到redis的一个set里面
            results = jobs.findAll("a", {"class": "classname"})
            for result in results:
                url = result["href"]
                urlitem = {
                    "url": url,
                    "type": "urltype"
                    }
                urlvalue = json.dumps(urlitem)
                pipe.sadd("url_list", urlitem)   # 这里将获取的url保存至url_list 这个redis集合里面
        pipe.srem("error_url_list", sourcejob)   #运行到这里,说明当前url如果是错误的url,那么就已经被处理,在这里删除掉
        pipe.execute()
    except Exception as e:
        error_name = e.__class__.__name__
        if error_name == "ConnectionError" or error_name == "ProxyError":    
                #通过判断错误类型(因为一些链接或者代理错误,
                #我会直接扔回专门保存错误url的集合里面,然后下次再去取出来处理)
            redis_db.sadd('error_url_list', sourcejob)    
            #现在我面临最恼火的问题就是其它比较正常,就在这里,
                #当程序开启的时候,偶尔会出现sadd抛出异常
            #因为这里是出了异常才在这里处理错误的url的
                #(将它保存到error_url_list,供后来的再处理),现在添加的时候出了异常,
            #这样就会让一大部分错误的url无法保存到对应的数据库,最后导致爬到的数据太少,
            #异常信息大致为:
#                ConnectionError
# <timer at 0x36c8c80 callback=<bound method Greenlet.throw of <Greenlet at 0xc844050>> args=(<class 'requests.exceptions.ConnectionError'>,)> failed with ConnectionError

# Traceback (most recent call last):
#   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/greenlet.py", line 327, in run
#     result = self._run(*self.args, **self.kwargs)
#   File "61.py", line 147, in GenerateUrl
#     redis_db.sadd('error_url_list', sourcejob)
#   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/client.py", line 1248, in sadd
#     return self.execute_command('SADD', name, *values)
#   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/client.py", line 461, in execute_command
#     return self.parse_response(connection, command_name, **options)
#   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/client.py", line 471, in parse_response
#     response = connection.read_response()
#   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/connection.py", line 339, in read_response
#     response = self._parser.read_response()
#   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/connection.py", line 110, in read_response
#     response = self.read()
#   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/connection.py", line 103, in read
#     return self._fp.readline()[:-2]
#   File "/usr/local/lib/python2.7/socket.py", line 447, in readline
#     data = self._sock.recv(self._rbufsize)
#   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/socket.py", line 392, in recv
#     self._wait(self._read_event)
#   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/socket.py", line 298, in _wait
#     self.hub.wait(watcher)
#   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/hub.py", line 341, in wait
#     result = waiter.get()
#   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/hub.py", line 568, in get
#     return self.hub.switch()
#   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/hub.py", line 331, in switch
#     return greenlet.switch(self)





if __name__ == '__main__':
    st = time.time()
    time.sleep(5)
    WYUrlGenerator()
    et = time.time()
    print "**************end****************,the spend time is %0.2f" % (et - st)

不知道各位对我这段代码有什么看法,或者吐槽也行,自己找了一些相关资料,成效不大。

共2条回复
路人甲 回复于 2014年06月05日

python有个爬虫框架,scrapy,可以看看

penkzhou 回复于 2014年06月05日

1楼 @路人甲 嗯,我想先解决这个问题吧

登录 或者 注册