
Scraping a novel site in Python with BeautifulSoup: handling garbled text and retrying failed requests

2020-05-11 网页编程网
# -*- coding: utf-8 -*-
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import time

urls = [
    'http://www.biquge.info/86_86175/',
]
headers = {
    'user-agent': UserAgent().random,
}
# collect the links to every chapter from the index page
def get_all_poem_link(urls):
    links = []
    for url in urls:
        response = requests.get(url, headers=headers)
        # fix garbled text: replace requests' header-based guess
        # with the encoding detected from the page bytes
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'lxml')
        dd_list = soup.find('div', id='list').find_all('dd')
        for dd in dd_list:
            urlc = dd.find('a')['href']  # or dd.find('a').get('href')
            print(url + urlc)  # chapter hrefs are relative to the index page
            links.append(url + urlc)
    return links
# download the text of one chapter from its link
def get_poem(url):
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        # decode the raw bytes with the detected encoding,
        # silently dropping any bytes that still fail to decode
        html = response.content.decode(response.apparent_encoding, 'ignore')
        soup = BeautifulSoup(html, 'lxml')
        content = soup.find('div', id='content').get_text()
        return content
    else:
        print('request failed:', response.status_code)
        return None
        
if __name__ == '__main__':
    result = ''
    url_list = get_all_poem_link(urls)
    for i, url in enumerate(url_list):
        print('downloading chapter %d' % (i + 1))
        content = get_poem(url)
        if content is None:
            print('chapter %d failed, retrying' % (i + 1))
            time.sleep(2)
            content = get_poem(url)
        if content:
            result += content
        else:
            print('chapter %d failed twice, skipping' % (i + 1))

    with open('ss.txt', 'w', encoding='utf-8') as f:
        f.write(result)
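
Why the garbled-text fix works: requests guesses the page encoding from the HTTP Content-Type header, and when a site doesn't declare a charset that guess is often wrong for Chinese pages. apparent_encoding instead detects the encoding from the response bytes themselves. A minimal sketch (assuming the index URL above is reachable; the printed values depend on the server's headers):

import requests

r = requests.get('http://www.biquge.info/86_86175/')
print(r.encoding)           # guess taken from the HTTP headers
print(r.apparent_encoding)  # encoding detected from the bytes themselves
r.encoding = r.apparent_encoding  # make r.text decode correctly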
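
The main loop above retries a failed chapter only once. A more reusable pattern is to wrap the download in a small retry helper. The sketch below reuses the article's get_poem() and also catches the network exceptions it doesn't handle; max_tries and delay are illustrative values, not from the original:

import time
import requests

def get_poem_with_retry(url, max_tries=3, delay=2):
    # try up to max_tries times with a growing pause between attempts;
    # returns None only if every attempt fails
    for attempt in range(1, max_tries + 1):
        try:
            content = get_poem(url)
        except requests.exceptions.RequestException as exc:
            content = None
            print('attempt %d raised %s' % (attempt, exc))
        if content:
            return content
        time.sleep(delay * attempt)  # linear backoff: 2s, 4s, 6s...
    return None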