1. 报错:AttributeError: 'NoneType' object has no attribute 'get_text'
原因:要取的元素不存在,查找结果为 None;在调用 get_text() 之前先判断一下,即:
if title is None: return
2. 防止程序因异常而终止,可以用 try/except 捕获异常,即:
# Template: wrap code that may fail so that one error does not stop the program.
try:
    # Placeholder: put the statements that may raise here.
    ...
except Exception as e:
    # Report the error and carry on instead of letting it terminate the program.
    print(e)
3. 请求过于频繁(连接过多)时,在导入 requests 之后加上下面两行:
# Create a requests session object.
s = requests.session()
# NOTE(review): presumably meant to disable HTTP keep-alive so connections are
# not held open; confirm that requests' Session actually honours a `keep_alive`
# attribute - sending a "Connection: close" header may be needed instead.
s.keep_alive = False
4.多线程并发抓取
from threading import Thread
try:
    from Queue import Queue  # Python 2 module name
except ImportError:
    from queue import Queue  # Python 3 renamed the module to lowercase
from time import sleep

# q is the task queue shared by the worker threads.
q = Queue()
# NUM is the total number of concurrent worker threads.
NUM = 2
# JOBS is how many tasks are put on the queue.
JOBS = 10
# The actual handler: responsible for processing a single task.
def do_somthing_using(arguments):
    """Process one task by printing ``arguments`` to standard output.

    Args:
        arguments: The task payload taken from the queue.
    """
    # Calling print with a single parenthesised argument works the same on
    # Python 2 and Python 3, unlike the Python 2-only print statement.
    print(arguments)
# Worker loop: keeps taking items from the queue and processing them.
def working():
    """Run forever, handling one queued task at a time.

    Each iteration blocks on ``q.get()``, passes the item to
    ``do_somthing_using``, sleeps for one second, and then marks the task as
    done so that ``q.join()`` can return once all tasks are finished.
    """
    while True:
        arguments = q.get()
        try:
            do_somthing_using(arguments)
            sleep(1)  # pause between tasks
        except Exception as e:
            # Report the failure and keep the worker alive for later tasks.
            print(e)
        finally:
            # Always mark the task done; otherwise a failing task would leave
            # q.join() waiting forever.
            q.task_done()
# Start NUM worker threads that wait on the queue.
for i in range(NUM):
    t = Thread(target=working)
    # Daemon threads do not keep the process alive after the main thread
    # finishes; the attribute form replaces the deprecated setDaemon().
    t.daemon = True
    t.start()
# Put JOBS tasks on the queue.
for i in range(JOBS):
    q.put(i)
# Wait until every queued task has been marked done.
q.join()