"""Scrape numbered site-detail pages and append SQL INSERT text to a file.

For each page the script extracts the title, breadcrumb entries, category,
outbound links and description, then appends one ``insert into table_temp``
statement to ``index2-2.txt``.
"""
import re
import smtplib
import time

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; ) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/82.0.4068.4 Safari/537.36'
}

# Seconds to wait for the server before giving up on a single page, so one
# stalled connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 30


def _page_number(url):
    """Return the path segment immediately preceding '.html' in *url*, or ''."""
    match = re.search(r'/([^/]+)\.html', url)
    return match.group(1) if match else ''


def weather(url):
    """Fetch one page and append an INSERT statement describing it.

    Args:
        url: Address of the page to fetch.

    Returns:
        The string 'null' when the response body is shorter than 600
        characters (no statement is written in that case); otherwise None.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.encoding = 'gb18030'
    if len(response.text) < 600:
        return 'null'

    soup = BeautifulSoup(response.text, 'html.parser')

    title_tag = soup.find('title')
    site_name = title_tag.get_text() if title_tag is not None else ''
    print(site_name)

    # Breadcrumb: the first link is skipped; the second and third links
    # (when present) become list1 / list2. Missing entries fall back to ''.
    position = soup.find(id='position')
    crumbs = position.find_all("a") if position is not None else []
    list1 = crumbs[1].text if len(crumbs) > 1 else ''
    list2 = crumbs[2].text if len(crumbs) > 2 else ''

    # The category is the first text found between "> " and " > " in the
    # breadcrumb markup; an empty result list means there is none.
    type_matches = re.findall("""> (.*?) > """, str(position))
    site_type = type_matches[0] if type_matches else ''

    site_info = soup.find(id="siteinfo")
    links = site_info.find_all('a') if site_info is not None else []
    print(links)
    # NOTE(review): the second URL is only taken when more than three links
    # exist, although index 2 is what gets read -- confirm the threshold.
    if len(links) > 3:
        url1 = links[0].get('href')
        url2 = links[2].get('href')
    elif links:
        url1 = links[0].get('href')
        url2 = ''
    else:
        url1 = ''
        url2 = ''

    # Check for a missing description *before* touching .text.
    desc = soup.find(id='sitetext')
    desc_text = desc.text if desc is not None else ''
    print(desc_text)

    number = _page_number(url)

    # NOTE(review): scraped values are interpolated unescaped, so a '"' in any
    # field yields a malformed statement. The column list also names siteHref
    # twice and uses `desc` as a column -- confirm against the table schema.
    with open('index2-2.txt', 'a', encoding='utf-8-sig') as f:
        f.write('insert into table_temp (number,site,type,siteHref,siteHref,list1,list2,desc) values ("{}","{}","{}","{}","{}","{}","{}","{}");'.format(number, site_name, site_type, url1, url2, list1, list2, desc_text))


if __name__ == '__main__':
    for i in range(143, 90000):
        # Pause between requests to avoid hammering the server.
        time.sleep(5)
        url = 'http://www.k*g-u*o-w-ai.com/html/{}.html'.format(i)
        print(url)
        try:
            weather(url)
        except requests.RequestException as exc:
            # A single unreachable page should not abort the whole crawl.
            print('request failed for {}: {}'.format(url, exc))
Python 综合应用 BeautifulSoup 与正则表达式爬取世界网址并生成文本
阅读:3483 输入:2021-03-29 08:34:51