主页 M

Python 综合应用 BeautifulSoup 与正则表达式爬取世界网址并生成文本

2021-03-29 网页编程网 网页编程网
import requests
from bs4 import BeautifulSoup
import smtplib
import re
import time
# User-Agent string identifying the client as Chrome 82 on Windows NT 6.1.
_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; ) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/82.0.4068.4 Safari/537.36'

# Request headers passed to requests.get() by weather().
headers = {'User-Agent': _USER_AGENT}
def weather(url):
    """Scrape one site-detail page and append an SQL INSERT line to a file.

    Despite its name, this function has nothing to do with weather: it
    downloads ``url``, pulls the page title, the breadcrumb links inside the
    element with id ``position``, the links inside the element with id
    ``siteinfo`` and the text of the element with id ``sitetext``, then
    appends one ``insert into table_temp ...`` statement to ``index2-2.txt``.

    Any piece of the page that is missing is written as an empty string
    instead of aborting the call.

    Args:
        url: Address of the page to scrape. The page number written to the
            file is the last path component before ``.html``.

    Returns:
        The string ``'null'`` when the response body is shorter than 600
        characters (nothing is written in that case); otherwise ``None``.

    Raises:
        Whatever ``requests.get`` raises for network failures or timeouts.
    """
    # A timeout keeps one unresponsive server from blocking the caller forever.
    response = requests.get(url, headers=headers, timeout=30)
    response.encoding = 'gb18030'
    page_text = response.text

    # Very short bodies are treated as "no such page"; skip before parsing.
    if len(page_text) < 600:
        return 'null'

    bs = BeautifulSoup(page_text, 'html.parser')

    title_tag = bs.find('title')
    siteName = title_tag.get_text() if title_tag is not None else ''
    print(siteName)

    # Breadcrumb: the 2nd and 3rd links become the two list levels. Default
    # to '' so a short or missing breadcrumb no longer leaves them unbound.
    position = bs.find(id='position')
    positionall = position.find_all('a') if position is not None else []
    positionR1 = positionall[1].text if len(positionall) > 1 else ''
    positionR2 = positionall[2].text if len(positionall) > 2 else ''

    # The category is the text between two '&gt;' separators in the
    # serialized breadcrumb markup (presumably '>' characters in the page).
    position_markup = str(position) if position is not None else ''
    typeTemp = re.findall("""&gt; (.*?) &gt; """, position_markup)
    site_type = typeTemp[0] if typeTemp else ''

    siteInfo = bs.find(id='siteinfo')
    siteURL = siteInfo.find_all('a') if siteInfo is not None else []
    print(siteURL)
    if len(siteURL) > 3:
        URL1 = siteURL[0].get('href')
        URL2 = siteURL[2].get('href')
    elif len(siteURL) > 0:
        URL1 = siteURL[0].get('href')
        URL2 = ''
    else:
        URL1 = ''
        URL2 = ''

    # Resolve the description before printing so a missing element is safe.
    desc = bs.find(id='sitetext')
    descR = desc.text if desc is not None else ''
    print(descR)

    # Page number = last path component before '.html' (e.g. '143').
    number_match = re.search(r'/([^/]+?)\.html', url)
    number = number_match.group(1) if number_match is not None else ''

    # NOTE(review): the column list names 'siteHref' twice and the values are
    # interpolated without escaping, so a '"' in scraped text yields an
    # invalid statement. Left unchanged because the target schema and SQL
    # dialect are not visible here -- confirm and fix before loading the file.
    with open('index2-2.txt', 'a', encoding='utf-8-sig') as f:
        f.write('insert into table_temp (number,site,type,siteHref,siteHref,list1,list2,desc) values ("{}","{}","{}","{}","{}","{}","{}","{}");'.format(number, siteName, site_type, URL1, URL2, positionR1, positionR2, descR))

if __name__ == '__main__':
    # Visit page ids 143 through 89999 in ascending order, sleeping five
    # seconds before each request, and hand every URL to weather().
    page_template = 'http://www.k*g-u*o-w-ai.com/html/{}.html'
    first_page, stop_page = 143, 90000
    page_id = first_page
    while page_id < stop_page:
        time.sleep(5)
        url = page_template.format(page_id)
        print(url)
        weather(url)
        page_id += 1
阅读原文
阅读 3486
123 显示电脑版