Scraping a poetry site with Python multithreading and BeautifulSoup, crawling multiple URLs

The script below first collects every poem link from three index pages on gushiwen.org, then downloads each poem concurrently with a ThreadPoolExecutor (at most 10 worker threads), waits for all tasks to finish, and finally prints each poem and appends it to poem.txt.

# -*- coding: utf-8 -*-
import requests
from urllib.parse import urljoin
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED

# Index pages to crawl.
urls = {
    'https://so.gushiwen.org/gushi/tangshi.aspx',
    'https://so.gushiwen.org/gushi/songci.aspx',
    'https://so.gushiwen.org/gushi/sanbai.aspx',
}
headers = {'user-agent': UserAgent().random}

def get_all_poem_link(urls):
    """Collect the poem links from every index page."""
    poem_links = []
    for url in urls:
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')       # 1. parse the page
        content = soup.find_all('div', class_='sons')[0]  # first block of links
        links = content.find_all('a')                     # 2. find every <a> tag
        for link in links:                                # 3. build absolute URLs
            href = link.get('href')
            if href:
                poem_links.append(urljoin('https://so.gushiwen.org/', href))
    return poem_links

# Shared list that the worker threads append their results to.
poem_list = []

def get_poem(url):
    """Fetch one poem page and store its text."""
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    poem = soup.find('div', class_='contson').text.strip()
    poem_list.append(poem)

if __name__ == '__main__':
    poem_links = get_all_poem_link(urls)
    # At most 10 concurrent requests.
    executor = ThreadPoolExecutor(max_workers=10)
    # Submit one task per poem link.
    future_tasks = [executor.submit(get_poem, url) for url in poem_links]
    # Block until every task has finished before moving on.
    wait(future_tasks, return_when=ALL_COMPLETED)
    with open('poem.txt', 'a', encoding='utf-8') as f:
        for poem in poem_list:
            print(poem)
            f.write(poem + '\n')
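One design note: the script stores results by appending to the module-level poem_list from the worker threads. CPython's GIL makes list.append itself thread-safe, but any exception raised inside get_poem (a request timeout, or a page missing the contson div) stays hidden inside its Future and that poem is silently skipped. Below is a minimal alternative sketch, not from the original post: each task returns its poem and the main thread gathers results with concurrent.futures.as_completed, so failures surface as exceptions. fetch_poem is a hypothetical rename, and the sketch assumes the urls, headers, imports, and get_all_poem_link definition from the script above.

from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch_poem(url):
    # Hypothetical variant of get_poem that returns its result
    # instead of appending to a shared list.
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'lxml')
    node = soup.find('div', class_='contson')
    return node.text.strip() if node else ''

if __name__ == '__main__':
    poem_links = get_all_poem_link(urls)
    poems = []
    # The context manager shuts the pool down once all tasks are done,
    # replacing the explicit wait(..., return_when=ALL_COMPLETED) call.
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(fetch_poem, url) for url in poem_links]
        for future in as_completed(futures):
            try:
                poems.append(future.result())  # re-raises any worker exception
            except requests.RequestException as e:
                print('request failed:', e)
    with open('poem.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(poems))

Returning values through the futures keeps each result tied to the task that produced it, which makes it straightforward to log or retry individual failures rather than discovering at the end that some poems never arrived.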