
Scraping a poetry site with Python multithreading and BeautifulSoup, set up to crawl multiple URLs

2020-05-07 网页编程网
# -*- coding: utf-8 -*-
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
from urllib.parse import urljoin

urls = {
    'https://so.gushiwen.org/gushi/tangshi.aspx',
    'https://so.gushiwen.org/gushi/songci.aspx',
    'https://so.gushiwen.org/gushi/sanbai.aspx',
}
headers = {
    'user-agent': UserAgent().random
}
# Collect the link to every poem listed on the index pages
def get_all_poem_link(urls):
    poem_links = []
    for url in urls:
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')  # 1. parse into a BeautifulSoup tree
        content = soup.find_all('div', class_='sons')[0]
        links = content.find_all('a')  # 2. find all <a> tags
        for link in links:  # 3. take the URL from each <a>
            if link.get('href'):  # skip anchors without an href
                poem_links.append(urljoin('https://so.gushiwen.org/', link['href']))
    return poem_links
# Collected results
poem_list = []

# Fetch one poem (or ci) page and extract its text
def get_poem(url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    content = soup.find('div', class_='contson')
    if content is not None:  # guard against pages missing the expected div
        poem_list.append(content.text.strip())
    
if __name__ == '__main__':
    poem_links = get_all_poem_link(urls)
    # thread pool with at most 10 concurrent workers
    executor = ThreadPoolExecutor(max_workers=10)
    # submit one task per poem link
    future_tasks = [executor.submit(get_poem, url) for url in poem_links]
    # block until every task has completed before moving on
    wait(future_tasks, return_when=ALL_COMPLETED)
    # open the output file once and append every collected poem
    with open('poem.txt', 'a', encoding='utf-8') as f:
        for poem in poem_list:
            print(poem)
            f.write(poem + '\n')
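
One hedged refinement: requests.get blocks with no time limit by default, so a single stalled page can tie up a worker thread indefinitely. Passing a timeout and raising on HTTP error codes is common practice in a multithreaded crawler. A minimal sketch; the fetch helper and the 10-second value are assumptions, not part of the original script:

def fetch(url):
    # Give up after 10 seconds and raise on 4xx/5xx responses;
    # the timeout value is an arbitrary assumption, tune as needed.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    return response.text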
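
The pool can also be driven through executor.map with ThreadPoolExecutor used as a context manager: the with-block waits for every task on exit and map yields results in submission order, so the shared poem_list and the explicit wait() call become unnecessary. A minimal sketch of that variant; get_poem_text is a hypothetical rewrite of get_poem that returns the text instead of appending to a global list, and it reuses headers and get_all_poem_link from above:

def get_poem_text(url):
    # Same parsing as get_poem, but return the text so map can collect it.
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    content = soup.find('div', class_='contson')
    return content.text.strip() if content is not None else ''

if __name__ == '__main__':
    poem_links = get_all_poem_link(urls)
    # The with-block shuts the pool down and waits for all tasks on exit;
    # map returns results in the order the links were submitted.
    with ThreadPoolExecutor(max_workers=10) as executor:
        poems = list(executor.map(get_poem_text, poem_links))
    with open('poem.txt', 'a', encoding='utf-8') as f:
        for poem in poems:
            if poem:
                print(poem)
                f.write(poem + '\n')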