# Scraper script: drives a headless Chrome session through Selenium, requests
# 'http://liaotian.puaas.com/{j}.html' for every j in range(0, 67892), and
# appends one "('title','listName','j','text')," tuple per accepted page to
# sql.txt (UTF-8, append mode).
#
# Per page, as far as the visible code shows:
#   * prints the length of the page source and the page number;
#   * pages whose source contains the site's "page does not exist" marker are
#     only reported ('not found page');
#   * pages whose source is longer than 50000 characters are processed: the
#     text of each <h1> element is printed and the last one is kept as `title`,
#     `listName` is taken from a regex match on the page, all whitespace is
#     removed from the source, and a chain of replace()/re.sub() calls strips
#     fragments before the result is written out;
#   * 'olstart' is turned back into 'ol start' because the whitespace removal
#     above also deletes the space inside that attribute.
# The inline non-English comment before `driver.get(url)` reads: "get the
# response and extract the target information".
#
# NOTE(review): the text below is not runnable as stored. Statements that
#   belong on separate lines are joined on one physical line (so everything
#   after `#11602` is parsed as a comment), and several string/regex literals
#   look truncated or emptied (e.g. `re.findall(r' » 0:`, `.replace('','')`,
#   the `page=`/`kills=` assignments). Presumably markup inside the literals
#   was lost upstream -- restore from the original file before running.
# NOTE(review): `______text` is assigned from `____text` and never read.
# NOTE(review): `title` and `listName` look like they can be unbound (or stale
#   from a previous page) at the write if no <h1> / no type match is found --
#   confirm once the control flow is restored.
# NOTE(review): values are interpolated into the SQL-like tuple without
#   escaping quotes; no `driver.quit()` is visible in this portion.
import time from selenium import webdriver import re option = webdriver.ChromeOptions() option.add_argument('headless') driver = webdriver.Chrome(executable_path='chromedriver', chrome_options=option) for j in range(0,67892):#11602 url = 'http://liaotian.puaas.com/{}.html'.format(j) #拿到响应信息,提取目标信息 driver.get(url) page_src = driver.page_source print('-----'+str(len(page_src))+'byte,page number:'+str(j)+'\n') if '您访问的页面不存' in page_src: print('not found page',j) elif len(page_src)>50000: print(str(j)+' page number \n') titles=driver.find_elements_by_tag_name('h1') for _title in titles: title=_title.text print(title) __type=re.findall(r' » 0: _type=re.findall(r'>(.*?)<', __type[0]) print(_type[0]) listName=_type[0] #print(page_src) page_src_noBRSPACE=re.sub(r'[\s]+', '', page_src) #page=re.findall(r"(.*?)(.*?)','',' ','',""] if len(page)>0: _page=page[0].replace('','').replace(' ','').replace(r' ','') #for kill in kills: # _page=page[0].replace(kill,'') _text = re.sub(r' ','',_page) __text = re.sub(r' ','',_text) ___text = re.sub(r' 以上,就是关于(.*?)
','',__text) ____text = re.sub(r'学习更多专业恋爱(.*?)
','',___text) _____text = re.sub(r'
','',____text) ______text = re.sub(r'
','',____text) text = re.sub(r' 学习(.*?)
','',_____text) #print(_page) text=text.replace('olstart','ol start') with open('sql.txt','a',encoding='utf-8') as f: f.write("('{}','{}','{}','{}'),".format(title,listName,j,text))