我的爬虫大致思想的是这样的,我想爬取某些列表页面上所有的列表url,有很多页,我遍历这些页面,然后抓去这些页面上的内容,当某个页面请求出错的时候,我就将它保存到一个数据库,下次从这个数据库里面把错误的取出来,然后再处理,这样一直循环,直到所有的都被处理完。不多说了,直接代码吧(更详细的问题描述见代码的注释)代码有点乱了,gist地址:https://gist.github.com/penkzhou/a657720be302f72269ca :
# _*_ coding: utf-8 _*_
import sys
reload(sys)
sys.setdefaultencoding("utf8")
from gevent import monkey
monkey.patch_all()
import requests
import redis
import gevent
from gevent.pool import Pool
from bs4 import BeautifulSoup
import time
from pymongo import MongoClient, ReadPreference
import json
import redis.connection
redis.connection.socket = gevent.socket
mongo_connection = MongoClient(
'%s:%d' % (
JobProjectConfiguration.save_mongo_host,
JobProjectConfiguration.save_mongo_port),
read_preference=ReadPreference.SECONDARY,
max_pool_size=10, use_greenlets=True)
mongo_db = mongo_connection.jobdigg
redis_connection = redis.ConnectionPool(
host=JobProjectConfiguration.url_queue_redis_host,
port=JobProjectConfiguration.url_queue_redis_port,
db=JobProjectConfiguration.url_queue_redis_db
)
redis_proxy_pool = redis.ConnectionPool(
host=JobProjectConfiguration.proxy_queue_redis_host,
port=JobProjectConfiguration.proxy_queue_redis_port,
db=JobProjectConfiguration.proxy_queue_redis_db
)
proxy_pool = []
pool_num = 100
header = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Encoding": "gzip,deflate,sdch",
"Accept-Language": "zh-CN,zh;q=0.8",
"Cache-Control": "max-age=0",
"Connection": "keep-alive",
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36"
}
def WYUrlGenerator():
print '51 Dig start : the url...'
start = time.time()
redis_db = redis.Redis(connection_pool=redis_connection)
urllist = WYJobUrlYield()
gpool = Pool(pool_num)
for uargs in urllist:
gpool.spawn(GenerateUrl, uargs)
gpool.join()
# 从这里开始,循环的从错误url集合里面取url,直至取完所有的
length = redis_db.scard("error_url_list")
while length > 0:
errorlist = ErrorUrlGenerator()
epool = Pool(pool_num)
for url in errorlist:
epool.spawn(GenerateUrl, url)
epool.join()
length = redis_db.scard("error_url_list")
end = time.time()
print 'dig end : the url...all spend time is %0.2f' % (end - start)
def WYJobUrlYield():
for page in xrange(3000):
page += 1
url = "http://some.crawl.url with page num %s" % page
jobitem = {
"url": url,
"type": "jobtype"
}
jobvalue = json.dumps(jobitem)
yield jobvalue
#从错误url的集合里面取出url 再次处理
def ErrorUrlGenerator():
redis_db = redis.Redis(connection_pool=redis_connection)
urllist = redis_db.smembers("error_url_list")
for url in urllist:
yield url
def GenerateUrl(sourcejob):
redis_db = redis.StrictRedis(connection_pool=redis_connection)
pipe = redis_db.pipeline()
newitem = json.loads(sourcejob)
url = newitem["url"]
urltype = newitem["type"]
try:
ip = proxy_pool.getProxy()
proxy = {"http": "http://"+ip["proxy"]}
timeout = gevent.Timeout(5, ConnectionError)#在这里设置超时,保证每次请求不会太长,阻塞后面的请求,超过5秒抛出错误
timeout.start()
r = requests.get(url, headers=header, proxies=proxy)
jobs = BeautifulSoup(r.text)
if urltype == "urltype": #获取页面的所有url,然后保存到redis的一个set里面
results = jobs.findAll("a", {"class": "classname"})
for result in results:
url = result["href"]
urlitem = {
"url": url,
"type": "urltype"
}
urlvalue = json.dumps(urlitem)
pipe.sadd("url_list", urlitem) # 这里将获取的url保存至url_list 这个redis集合里面
pipe.srem("error_url_list", sourcejob) #运行到这里,说明当前url如果是错误的url,那么就已经被处理,在这里删除掉
pipe.execute()
except Exception as e:
error_name = e.__class__.__name__
if error_name == "ConnectionError" or error_name == "ProxyError":
#通过判断错误类型(因为一些链接或者代理错误,
#我会直接扔回专门保存错误url的集合里面,然后下次再去取出来处理)
redis_db.sadd('error_url_list', sourcejob)
#现在我面临最恼火的问题就是其它比较正常,就在这里,
#当程序开启的时候,偶尔会出现sadd抛出异常
#因为这里是出了异常才在这里处理错误的url的
#(将它保存到error_url_list,供后来的再处理),现在添加的时候出了异常,
#这样就会让一大部分错误的url无法保存到对应的数据库,最后导致爬到的数据太少,
#异常信息大致为:
# ConnectionError
# <timer at 0x36c8c80 callback=<bound method Greenlet.throw of <Greenlet at 0xc844050>> args=(<class 'requests.exceptions.ConnectionError'>,)> failed with ConnectionError
# Traceback (most recent call last):
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/greenlet.py", line 327, in run
# result = self._run(*self.args, **self.kwargs)
# File "61.py", line 147, in GenerateUrl
# redis_db.sadd('error_url_list', sourcejob)
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/client.py", line 1248, in sadd
# return self.execute_command('SADD', name, *values)
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/client.py", line 461, in execute_command
# return self.parse_response(connection, command_name, **options)
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/client.py", line 471, in parse_response
# response = connection.read_response()
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/connection.py", line 339, in read_response
# response = self._parser.read_response()
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/connection.py", line 110, in read_response
# response = self.read()
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/connection.py", line 103, in read
# return self._fp.readline()[:-2]
# File "/usr/local/lib/python2.7/socket.py", line 447, in readline
# data = self._sock.recv(self._rbufsize)
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/socket.py", line 392, in recv
# self._wait(self._read_event)
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/socket.py", line 298, in _wait
# self.hub.wait(watcher)
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/hub.py", line 341, in wait
# result = waiter.get()
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/hub.py", line 568, in get
# return self.hub.switch()
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/hub.py", line 331, in switch
# return greenlet.switch(self)
if __name__ == '__main__':
st = time.time()
time.sleep(5)
WYUrlGenerator()
et = time.time()
print "**************end****************,the spend time is %0.2f" % (et - st)
不知道各位对我这段代码有什么看法,或者吐槽也行,自己找了一些相关资料,成效不大。