主页 M

采集案例

2023-02-09 网页编程网 网页编程网
import time
from selenium import webdriver
import re

# Start a headless Chrome session; `driver` is the browser handle used by the
# crawl loop further down in this script.
option = webdriver.ChromeOptions()
option.add_argument('headless')
# `chrome_options=` is a deprecated alias of `options=` in Selenium's Chrome
# WebDriver; passing `options=` configures the same thing without the
# DeprecationWarning.
driver = webdriver.Chrome(executable_path='chromedriver', options=option)


# Crawl pages 0..67891 of liaotian.puaas.com through the shared `driver`.
# For every page whose source is longer than 50000 characters the loop prints
# the <h1> text, derives `listName`, cleans the body text and appends one
# "('title','listName','j','text')," tuple to sql.txt.
# NOTE(review): from the `__type=` line onward this copy of the script is
# garbled -- string literals are truncated, empty, or split across lines
# (it looks like markup inside them was stripped). The code is kept exactly
# as found; the original patterns must be restored before it can run.
for j in range(0,67892):#11602
    url = 'http://liaotian.puaas.com/{}.html'.format(j)
    # Fetch the response and extract the target information.
    driver.get(url)
    page_src = driver.page_source
    print('-----'+str(len(page_src))+'byte,page number:'+str(j)+'\n')
    
    # The literal below is matched against the page source to detect the
    # site's "page not found" response; such pages are only reported.
    if '您访问的页面不存' in page_src:
        print('not found page',j)
    # Only pages with more than 50000 characters of source are processed.
    elif len(page_src)>50000:
        print(str(j)+' page number \n')
        titles=driver.find_elements_by_tag_name('h1')
        # `title` ends up holding the text of the last <h1>; it is left
        # unassigned for this iteration if the page has no <h1>.
        for _title in titles:
          title=_title.text
          print(title)

        # NOTE(review): the next line is truncated -- the raw string never
        # closes and what looks like the tail of an `if len(...) > 0:` is
        # fused onto it. The original pattern cannot be recovered from here.
        __type=re.findall(r' » 0:
          _type=re.findall(r'>(.*?)<', __type[0])
          print(_type[0])
          listName=_type[0]
        #print(page_src)
        # Remove every run of whitespace from the page source.
        page_src_noBRSPACE=re.sub(r'[\s]+', '', page_src)
        # NOTE(review): `page` is not assigned on any visible line; its
        # assignment (and presumably a `kills` list) appears to be lost
        # inside the garbled comment below -- confirm against the original.
        #page=re.findall(r"(.*?)(.*?)','','','',""]
        if len(page)>0:
          
          # NOTE(review): every replace()/re.sub() pattern from here to the
          # end of the loop is empty or broken across lines in this copy,
          # and the final statements are collapsed onto a single line.
          _page=page[0].replace('','').replace('','').replace(r'','')
          #for kill in kills:
          #  _page=page[0].replace(kill,'')

          _text = re.sub(r'','',_page)
          __text = re.sub(r'','',_text)
          ___text = re.sub(r'

以上,就是关于(.*?)

','',__text)           ____text = re.sub(r'

学习更多专业恋爱(.*?)

','',___text)           _____text = re.sub(r'

','',____text)           ______text = re.sub(r'

','',____text)           text = re.sub(r'

学习(.*?)

','',_____text)           #print(_page)           text=text.replace('olstart','ol start')           with open('sql.txt','a',encoding='utf-8') as f:               f.write("('{}','{}','{}','{}'),".format(title,listName,j,text))      
阅读原文
阅读 1579
123 显示电脑版