# Scraping a Novel Site in Python with BeautifulSoup: Handling Garbled Text and Retrying Failed Requests

The script below pulls the chapter list from the index page at http://www.biquge.info/86_86175/, downloads each chapter, fixes the page encoding, retries a failed chapter once, and writes everything to ss.txt.

```python
# -*- coding: utf-8 -*-
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import time

urls = {
    'http://www.biquge.info/86_86175/'
}
headers = {
    'user-agent': UserAgent().random  # random UA so the site is less likely to block us
}

# collect the links to every chapter from the index page
def get_all_poem_link(urls):
    links = []
    for url in urls:
        response = requests.get(url, headers=headers)
        # the charset in the HTTP header is often wrong for this site; switch to
        # the charset detected from the page bytes to avoid garbled text
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'lxml')  # parse into a BeautifulSoup tree
        dd_list = soup.find('div', id='list').find_all('dd')
        for dd in dd_list:
            urlc = dd.find('a')['href']  # could also use dd.find('a').get('href')
            print('http://www.biquge.info/86_86175/' + urlc)
            links.append('http://www.biquge.info/86_86175/' + urlc)
    return links

poem_list = []  # result buffer (unused in this script)

# download the body text of one chapter
def get_poem(url):
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        # decode the raw bytes with the detected charset, dropping undecodable bytes
        html = response.content.decode(response.apparent_encoding, 'ignore')
        soup = BeautifulSoup(html, 'lxml')
        content = soup.find('div', id='content').get_text()
        return content
    else:
        print('request failed')
        return None

if __name__ == '__main__':
    result = ''
    url_list = get_all_poem_link(urls)
    for i, url in enumerate(url_list):
        print('downloading chapter %d' % (i + 1))
        content = get_poem(url)
        if content:
            result += content
        else:
            print('chapter %d failed, retrying' % (i + 1))
            time.sleep(2)  # wait a moment before the retry
            content = get_poem(url)
            if content:  # the retry can fail too; skip the chapter instead of crashing
                result += content
    with open('ss.txt', 'w', encoding='utf-8') as f:
        f.write(result)
```

Two fixes over the first draft of this script: the bare statement `response.encoding` in `get_all_poem_link` was a no-op and has been turned into the assignment `response.encoding = response.apparent_encoding`, and the retry branch in `__main__` now checks the second attempt's result, since `result += None` would raise a TypeError.
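Why the encoding dance: when the HTTP header does not name a charset, requests falls back to ISO-8859-1 for text responses, so `response.text` on a GBK or UTF-8 page comes out garbled. `response.apparent_encoding` instead runs requests' bundled charset detector over the raw bytes. Below is a minimal standalone sketch of the two equivalent fixes used in the script, using the same index URL; what the two prints show depends on the headers this server actually sends.

```python
import requests

response = requests.get('http://www.biquge.info/86_86175/')
print(response.encoding)           # charset taken from the HTTP header (often ISO-8859-1)
print(response.apparent_encoding)  # charset detected from the raw bytes (e.g. GBK or UTF-8)

# Fix 1: tell requests the real charset, then let response.text decode for you
response.encoding = response.apparent_encoding
html = response.text

# Fix 2: decode the raw bytes yourself, silently dropping undecodable bytes
html = response.content.decode(response.apparent_encoding, 'ignore')
```

The script uses fix 1 for the index page and fix 2 for the chapter pages; both produce readable text.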
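A small aside on link building: `get_all_poem_link` concatenates the base URL and the href by hand, which only works while the hrefs stay relative. The standard library's `urllib.parse.urljoin` handles both relative and absolute hrefs; a sketch, with an illustrative href rather than one taken from the real page:

```python
from urllib.parse import urljoin

base = 'http://www.biquge.info/86_86175/'
print(urljoin(base, '123456.html'))                             # relative href -> appended to base
print(urljoin(base, 'http://www.biquge.info/86_86175/1.html'))  # absolute href -> kept as-is
```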
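Finally, the retry in `__main__` is a single extra attempt. If you want something sturdier, a bounded retry loop with a timeout is the usual pattern. The sketch below is not part of the original script; the helper name `fetch_with_retry` and its `max_attempts`/`delay` parameters are made up for illustration.

```python
import time
import requests

def fetch_with_retry(url, headers, max_attempts=3, delay=2):
    """Fetch url, retrying up to max_attempts times; return decoded HTML or None."""
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                # same garbled-text fix as the main script
                return response.content.decode(response.apparent_encoding, 'ignore')
            print('attempt %d: HTTP %d' % (attempt, response.status_code))
        except requests.RequestException as exc:
            print('attempt %d failed: %s' % (attempt, exc))
        time.sleep(delay)  # back off before trying again
    return None
```

With a helper like this, `get_poem` would only need to parse the HTML, and the retry branch in `__main__` could go away entirely.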