采集案例

2023-02-09 网页编程网 网页编程网
import time
from selenium import webdriver
import re

# Configure a headless Chrome session that the scraping loop below reuses
# for every page request.
option = webdriver.ChromeOptions()
option.add_argument('headless')
# Pass the options object through `options=`; the older `chrome_options=`
# keyword is a deprecated alias for the same argument.
driver = webdriver.Chrome(executable_path='chromedriver', options=option)


# Main scraping loop: for each page number j, load the page in the shared
# `driver`, pull out the title, the category name and the cleaned article
# text, and append them as one SQL value tuple to sql.txt.
# NOTE(review): several regex/string literals below appear to have lost
# their HTML-tag content when this script was copied out of a web page, so
# the block is not valid Python as shown; the patterns must be restored
# from the original source before this can run.
for j in range(0,67892):#11602
    url = 'http://liaotian.puaas.com/{}.html'.format(j)
    # Fetch the page and extract the target information.
    driver.get(url)
    page_src = driver.page_source
    # len() counts characters of the page source, even though the message
    # labels the number as 'byte'.
    print('-----'+str(len(page_src))+'byte,page number:'+str(j)+'\n')
    
    # Presumably a fragment of the site's "page does not exist" message.
    if '您访问的页面不存' in page_src:
        print('not found page',j)
    # Only pages whose source is longer than 50000 characters are processed.
    elif len(page_src)>50000:
        print(str(j)+' page number \n')
        titles=driver.find_elements_by_tag_name('h1')
        # `title` ends up holding the text of the last <h1>; if the page has
        # no <h1>, it keeps the value from an earlier iteration (or is
        # undefined on the first processed page).
        for _title in titles:
          title=_title.text
          print(title)

        # NOTE(review): the next line is truncated: the regex literal is
        # never closed and an `if len(...) > 0:` guard seems to have been
        # fused into it. `listName` is only assigned inside that guard, so
        # it may be stale or undefined when written to the file below.
        __type=re.findall(r' » 0:
          _type=re.findall(r'>(.*?)<', __type[0])
          print(_type[0])
          listName=_type[0]
        #print(page_src)
        # Remove every run of whitespace from the page source.
        page_src_noBRSPACE=re.sub(r'[\s]+', '', page_src)
        # NOTE(review): the commented-out line below looks like the remains
        # of the `page = re.findall(...)` assignment (and possibly a `kills`
        # list) merged into one comment; `page` is never assigned in the
        # visible code, so the following `if` would raise NameError.
        #page=re.findall(r"(.*?)(.*?)','','','',""]
        if len(page)>0:
          
          # As written, replacing '' with '' is a no-op; presumably the
          # search strings were HTML tags lost in extraction (TODO confirm).
          _page=page[0].replace('','').replace('','').replace(r'','')
          #for kill in kills:
          #  _page=page[0].replace(kill,'')

          # Chain of clean-up substitutions. The empty patterns are no-ops
          # as written (same extraction damage as above). In the two
          # patterns that end in a lazy `(.*?)`, the group matches the empty
          # string, so only the literal prefix is removed.
          _text = re.sub(r'','',_page)
          __text = re.sub(r'','',_text)
          ___text = re.sub(r'以上，就是关于(.*?)','',__text)
          ____text = re.sub(r'学习更多专业恋爱(.*?)','',___text)
          # NOTE(review): `______text` (six underscores) is computed from
          # `____text` and never read afterwards; `text` is derived from
          # `_____text`. The two literals that follow are also split across
          # lines, which a single-quoted string does not allow.
          _____text = re.sub(r'','',____text)
          ______text = re.sub(r'
','',____text)
          text = re.sub(r'
学习(.*?)','',_____text)
          #print(_page)
          # Presumably restores the space in `<ol start=...>` that the
          # whitespace stripping above removed (TODO confirm).
          text=text.replace('olstart','ol start')
          # Append one "(title, listName, page number, text)," tuple per page.
          # NOTE(review): values are interpolated without escaping, so a
          # single quote in the title or text yields invalid SQL.
          with open('sql.txt','a',encoding='utf-8') as f:
              f.write("('{}','{}','{}','{}'),".format(title,listName,j,text))
阅读原文
阅读 3071
123 显示电脑版