"""Scrape nationwide hotel data from tuniu.com with Selenium and store it in MySQL.

The script walks every province/city listed in ``a.txt`` (a JSON object
mapping province -> ["code|cityname", ...]), drives a Chrome browser through
each city's paginated hotel list, opens every hotel's detail page in a new
window, parses the fields with BeautifulSoup and inserts a row into the
``hotel.ehotel`` table.

NOTE(review): the ``find_element_by_*`` methods and the ``executable_path``
keyword used below were removed in Selenium 4.3 — this script assumes an
older Selenium; confirm the installed version before upgrading.
"""
import json
import re  # kept from the original file; may be used by other revisions
import time

import pymysql
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains

# Module-level DB connection; save() commits through this global.
# The ehotel table is assumed to be created beforehand.
connect = pymysql.connect(host="localhost", port=3306, user="root",
                          passwd="123123", database="hotel", charset="utf8")
cursor = connect.cursor()


def save(cursor, page_source, pro, city):
    """Parse one hotel detail page and insert a row into ``ehotel``.

    :param cursor: open pymysql cursor (commit happens on the global connect)
    :param page_source: HTML source of the hotel detail page
    :param pro: province name (the key from a.txt)
    :param city: city name (the Chinese part after the '|')
    """
    bs = BeautifulSoup(page_source, 'html.parser')

    name = ''
    tag = bs.find('div', class_='hotel-name')
    if tag is not None:
        name = tag.get_text()

    hotelId = ''
    tag = bs.find('span', class_='code')
    if tag is not None:
        hotelId = tag.get_text()

    address = ''
    tag = bs.find('div', class_='address f-r')
    if tag is not None:
        address = tag.get_text()

    # The last 'textCommon' paragraph carries the phone number after a
    # fullwidth colon; every earlier paragraph is part of the introduction.
    # find_all() returns a (possibly empty) list, never None, so no None
    # check is needed (the original `!= None` test was always true).
    introduction = ''
    phone = ''
    infos = bs.find_all('p', class_='textCommon')
    for i, info in enumerate(infos):
        if i == len(infos) - 1:
            parts = info.get_text().split('：')
            # Guard: the original indexed [1] unconditionally and crashed
            # on paragraphs without the expected colon.
            phone = parts[1] if len(parts) > 1 else ''
        else:
            introduction += info.get_text()

    # Parameterized query: the values come from scraped, untrusted page
    # text — never interpolate them into the SQL string.
    insert_sql = (
        "insert into ehotel (pro,city,hotelId,name,address,introduction,phone)"
        " values (%s,%s,%s,%s,%s,%s,%s)"
    )
    if str(hotelId) != '':
        cursor.execute(insert_sql, (str(pro), str(city), str(hotelId),
                                    str(name), str(address),
                                    str(introduction), str(phone)))
        connect.commit()
        print(name + 'insert successfully')
    else:
        print('------got empty data,can not insert')


# ---- Chrome driver setup (module-level side effects kept as in the
# ---- original: the browser opens as soon as the module is imported).
option = Options()
# option.add_argument('headless')
option.add_argument('--proxy-server=http://127.0.0.1:2745')
option.add_argument('user-agent="{}"'.format(UserAgent().random))
driver = webdriver.Chrome(executable_path='chromedriver', options=option)

# Open the tuniu hotel list page (Guangzhou is the starting city).
driver.get("https://hotel.tuniu.com/list/602p0s0b0?cityName=%E5%B9%BF%E5%B7%9E")
driver.maximize_window()
time.sleep(3)  # wait for the initial page to load


def getData():
    """Iterate every province/city read from a.txt and crawl its hotels."""
    # a.txt holds JSON: {province: ["code|cityname", ...], ...}
    with open("a.txt", mode="r", encoding="utf-8") as file:
        jsondata = json.loads(file.read())
    for pro in jsondata:
        for city in jsondata[pro]:
            # The Chinese city name follows the '|' separator.
            place = str(city).split("|")[1]
            # Switch the list page to this city: clear the city input box,
            # type the name, then click the first autocomplete suggestion.
            city_input = ".city-div > input:nth-child(1)"
            driver.find_element_by_css_selector(city_input).clear()
            driver.find_element_by_css_selector(city_input).send_keys(place)
            time.sleep(2)
            # The suggestion is clicked at a fixed screen offset; move the
            # virtual cursor back so the offset stays absolute next time.
            ActionChains(driver).move_by_offset(226, 263).click().perform()
            ActionChains(driver).move_by_offset(-226, -263).perform()
            time.sleep(5)
            click_next(driver, pro, place)


def judgeLen(temp):
    """Return the first element of ``temp``, or the string "null" if empty."""
    return temp[0] if len(temp) > 0 else "null"


def click_next(driver, pro, place):
    """Crawl every hotel on every list page of one city, following pagination.

    :param driver: shared Chrome driver, positioned on the city list page
    :param pro: province name (passed through to save())
    :param place: city name (passed through to save())
    """
    n = 1  # current page number, used only for progress logging
    print('正在爬取', place, '酒店信息', '第%d页' % n)
    # If the city has a single page there is no 'arrowR' (next-page arrow)
    # and the while loop is skipped entirely.
    flag = 'arrowR"' in driver.page_source
    while flag:
        # Re-read the page source every pass; the original read it once
        # before the loop, so this break could never trigger after paging.
        page_source = driver.page_source
        if 'arrowR"' not in page_source:
            break
        next_elem = driver.find_element_by_class_name('arrowR')
        time.sleep(70)  # long throttle to dodge the site's rate limiting
        elements = driver.find_elements_by_css_selector('[class="detail-btn f-s"]')
        # Window handles before any detail page is opened (main list page).
        h1 = driver.window_handles
        for element in elements:
            # Each click opens the hotel detail page in a new window.
            element.click()
            time.sleep(80)
            h2 = driver.window_handles
            # NOTE(review): assumes exactly one new window opened — h2[1]
            # is the detail page; confirm popups can't add more.
            driver.switch_to.window(h2[1])
            currentPageUrl = driver.current_url
            driver.get(currentPageUrl)  # reload to settle the final source
            if '速度过快' in driver.title:
                # Rate-limited by the site: back off, skip this hotel.
                print('--now in sleep for reason 访问速度过快--')
                time.sleep(4000)
            else:
                save(cursor, driver.page_source, pro, place)
                time.sleep(5)
            driver.close()  # close the detail window
            driver.switch_to.window(h2[0])  # back to the main list page
        time.sleep(90)
        next_elem.click()
        n += 1
        print('while正在爬取', place, '酒店信息', '第%d页' % n)
    print(place, '爬取完成', '共%d页' % n)


if __name__ == '__main__':
    getData()
python应用selenium爬取途牛全国的酒店数据并插入数据库,考虑分页
阅读:3877 输入:2021-05-31 18:13:00