# Scraping a Novel Site in Python with BeautifulSoup: Handling Garbled Text and Retrying Failed Requests

The script below pulls the chapter list from the index page at http://www.biquge.info/86_86175/, downloads each chapter, fixes the page encoding, retries a failed chapter once, and writes everything to ss.txt.

```python
# -*- coding: utf-8 -*-
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import time

urls = {
    'http://www.biquge.info/86_86175/'
}
headers = {
    'user-agent': UserAgent().random  # random UA so the site is less likely to block us
}

# collect the links to every chapter from the index page
def get_all_poem_link(urls):
    links = []
    for url in urls:
        response = requests.get(url, headers=headers)
        # the charset in the HTTP header is often wrong for this site; switch to
        # the charset detected from the page bytes to avoid garbled text
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'lxml')  # parse into a BeautifulSoup tree
        dd_list = soup.find('div', id='list').find_all('dd')
        for dd in dd_list:
            urlc = dd.find('a')['href']  # could also use dd.find('a').get('href')
            print('http://www.biquge.info/86_86175/' + urlc)
            links.append('http://www.biquge.info/86_86175/' + urlc)
    return links

poem_list = []  # result buffer (unused in this script)

# download the body text of one chapter
def get_poem(url):
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        # decode the raw bytes with the detected charset, dropping undecodable bytes
        html = response.content.decode(response.apparent_encoding, 'ignore')
        soup = BeautifulSoup(html, 'lxml')
        content = soup.find('div', id='content').get_text()
        return content
    else:
        print('request failed')
        return None

if __name__ == '__main__':
    result = ''
    url_list = get_all_poem_link(urls)
    for i, url in enumerate(url_list):
        print('downloading chapter %d' % (i + 1))
        content = get_poem(url)
        if content:
            result += content
        else:
            print('chapter %d failed, retrying' % (i + 1))
            time.sleep(2)  # wait a moment before the retry
            content = get_poem(url)
            if content:  # the retry can fail too; skip the chapter instead of crashing
                result += content
    with open('ss.txt', 'w', encoding='utf-8') as f:
        f.write(result)
```

Two fixes over the first draft of this script: the bare statement `response.encoding` in `get_all_poem_link` was a no-op and has been turned into the assignment `response.encoding = response.apparent_encoding`, and the retry branch in `__main__` now checks the second attempt's result, since `result += None` would raise a TypeError.
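Why the encoding dance: when the HTTP header does not name a charset, requests falls back to ISO-8859-1 for text responses, so `response.text` on a GBK or UTF-8 page comes out garbled. `response.apparent_encoding` instead runs requests' bundled charset detector over the raw bytes. Below is a minimal standalone sketch of the two equivalent fixes used in the script, using the same index URL; what the two prints show depends on the headers this server actually sends.

```python
import requests

response = requests.get('http://www.biquge.info/86_86175/')
print(response.encoding)           # charset taken from the HTTP header (often ISO-8859-1)
print(response.apparent_encoding)  # charset detected from the raw bytes (e.g. GBK or UTF-8)

# Fix 1: tell requests the real charset, then let response.text decode for you
response.encoding = response.apparent_encoding
html = response.text

# Fix 2: decode the raw bytes yourself, silently dropping undecodable bytes
html = response.content.decode(response.apparent_encoding, 'ignore')
```

The script uses fix 1 for the index page and fix 2 for the chapter pages; both produce readable text.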
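A small aside on link building: `get_all_poem_link` concatenates the base URL and the href by hand, which only works while the hrefs stay relative. The standard library's `urllib.parse.urljoin` handles both relative and absolute hrefs; a sketch, with an illustrative href rather than one taken from the real page:

```python
from urllib.parse import urljoin

base = 'http://www.biquge.info/86_86175/'
print(urljoin(base, '123456.html'))                             # relative href -> appended to base
print(urljoin(base, 'http://www.biquge.info/86_86175/1.html'))  # absolute href -> kept as-is
```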
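Finally, the retry in `__main__` is a single extra attempt. If you want something sturdier, a bounded retry loop with a timeout is the usual pattern. The sketch below is not part of the original script; the helper name `fetch_with_retry` and its `max_attempts`/`delay` parameters are made up for illustration.

```python
import time
import requests

def fetch_with_retry(url, headers, max_attempts=3, delay=2):
    """Fetch url, retrying up to max_attempts times; return decoded HTML or None."""
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                # same garbled-text fix as the main script
                return response.content.decode(response.apparent_encoding, 'ignore')
            print('attempt %d: HTTP %d' % (attempt, response.status_code))
        except requests.RequestException as exc:
            print('attempt %d failed: %s' % (attempt, exc))
        time.sleep(delay)  # back off before trying again
    return None
```

With a helper like this, `get_poem` would only need to parse the HTML, and the retry branch in `__main__` could go away entirely.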