# -*- coding: utf-8 -*-
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
from urllib.parse import urljoin
urls = {
    'https://so.gushiwen.org/gushi/tangshi.aspx',
    'https://so.gushiwen.org/gushi/songci.aspx',
    'https://so.gushiwen.org/gushi/sanbai.aspx'
}
headers = {
    'user-agent': UserAgent().random
}
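
# Note: UserAgent().random is evaluated once, when this dict is defined, so every
# request reuses the same User-Agent string. Rebuilding the headers dict inside
# each request function would rotate it per request (optional hardening).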
# Collect the detail-page link of every poem from the index pages
def get_all_poem_link(urls):
    poem_links = []
    for url in urls:
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')  # 1. parse the page into a soup tree
        content = soup.find_all('div', class_='sons')[0]  # the first .sons block holds the link list
        links = content.find_all('a')  # 2. find every <a> tag in that block
        for link in links:  # 3. take the URL out of each <a>
            href = link.get('href')
            if href:  # some <a> tags carry no href; skip them
                poem_links.append(urljoin(url, href))  # urljoin avoids the double slash that naive concatenation produces
    return poem_links
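
# Optional hardening (a sketch; the functions in this script do not call it): add a
# timeout so a stalled server cannot hang a worker thread, and raise on HTTP errors
# instead of parsing an error page. The helper name `fetch` is hypothetical.
def fetch(url):
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()  # raises requests.HTTPError on 4xx/5xx responses
    return response
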
# Shared result list; list.append is thread-safe in CPython, so worker threads can append directly
poem_list = []
# Download one poem (shi or ci) from its detail-page link
def get_poem(url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    content = soup.find('div', class_='contson')
    if content is not None:  # guard: a page without a poem body would otherwise raise AttributeError
        poem_list.append(content.text.strip())
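
# Alternative without shared state (a sketch, assuming get_poem is changed to
# return the stripped text instead of appending to poem_list):
#   futures = [executor.submit(get_poem, url) for url in poem_links]
#   poems = [f.result() for f in futures if f.result() is not None]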
if __name__ == '__main__':
    poem_links = get_all_poem_link(urls)
    # Cap concurrency at 10 worker threads
    executor = ThreadPoolExecutor(max_workers=10)
    # Submit one download task per poem link
    future_tasks = [executor.submit(get_poem, url) for url in poem_links]
    # Block here until every task has finished, then continue
    wait(future_tasks, return_when=ALL_COMPLETED)
    # Open the output file once, rather than re-opening it for every poem
    with open('poem.txt', 'a', encoding='utf-8') as f:
        for poem in poem_list:
            print(poem)
            f.write(poem + '\n')
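
# Equivalent, slightly tidier main block (a sketch): ThreadPoolExecutor used as a
# context manager submits the tasks eagerly and shuts the pool down, waiting for
# all of them, on exit:
#   with ThreadPoolExecutor(max_workers=10) as executor:
#       executor.map(get_poem, poem_links)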