import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
from bs4 import BeautifulSoup
import re
from fake_useragent import UserAgent
import pymysql
# Module-level MySQL connection and cursor, shared by save() below.
# NOTE(review): credentials are hard-coded — move to config/env for production.
connect = pymysql.connect(host="localhost", port=3306, user="root", passwd="123123",database="hotel",charset="utf8")
# Single shared cursor used for all inserts into the `hotel` database.
cursor = connect.cursor()
def save(cursor, page_source, pro, city):
    """Parse one hotel detail page and insert the record into table `ehotel`.

    Args:
        cursor: open pymysql cursor used for the INSERT.
        page_source: raw HTML of the hotel detail page.
        pro: province name the hotel belongs to.
        city: city name the hotel belongs to.

    Side effects: when a non-empty hotel id is parsed, executes an INSERT
    and commits on the module-level `connect` connection; otherwise only
    prints a diagnostic line.
    """
    bs = BeautifulSoup(page_source, 'html.parser')

    name = ''
    tag = bs.find('div', class_='hotel-name')
    if tag is not None:
        name = tag.get_text()

    hotelId = ''
    tag = bs.find('span', class_='code')
    if tag is not None:
        hotelId = tag.get_text()

    address = ''
    tag = bs.find('div', class_='address f-r')
    if tag is not None:
        address = tag.get_text()

    introduction = ''
    phone = ''
    # find_all always returns a (possibly empty) list, never None, so the
    # original `!= None` guard was a no-op and is dropped.
    infos = bs.find_all('p', class_='textCommon')
    for i, info in enumerate(infos):
        text = info.get_text()
        if i == len(infos) - 1:
            # The last paragraph carries the phone as "label:<number>".
            # Guard against a missing colon instead of raising IndexError.
            parts = text.split(':', 1)
            phone = parts[1] if len(parts) > 1 else ''
        else:
            introduction += text

    if str(hotelId) != '':
        # Parameterized query: scraped text may contain quotes that would
        # break — or inject into — a string-formatted SQL statement.
        insertSql = (
            "insert into ehotel "
            "(pro,city,hotelId,name,address,introduction,phone) "
            "values (%s,%s,%s,%s,%s,%s,%s)"
        )
        cursor.execute(insertSql, (str(pro), str(city), str(hotelId),
                                   str(name), str(address),
                                   str(introduction), str(phone)))
        connect.commit()
        print(name + 'insert successfully')
    else:
        print('------got empty data,can not insert')
# Set up the Chrome driver.
option = Options()
#option.add_argument('headless')
# Route traffic through a local proxy — presumably a rotating-IP tool;
# NOTE(review): confirm something is actually listening on 127.0.0.1:2745.
option.add_argument('--proxy-server=http://127.0.0.1:2745')
# Randomize the user agent for each run.
option.add_argument('user-agent="{}"'.format(UserAgent().random))
# NOTE(review): executable_path is deprecated in Selenium 4 — confirm the
# installed selenium version still accepts it.
driver = webdriver.Chrome(executable_path='chromedriver', options=option)
# Open the Tuniu hotel list page (Guangzhou).
driver.get("https://hotel.tuniu.com/list/602p0s0b0?cityName=%E5%B9%BF%E5%B7%9E")
# Maximize the browser window.
driver.maximize_window()
# Sleep 3 seconds so the page can finish loading.
time.sleep(3)
def getData():
    """Iterate every province/city pair from a.txt and crawl its hotels.

    a.txt contains a JSON object mapping province name -> list of
    "code|cityname" strings. For each city the list page's city input box
    is rewritten so the site jumps to that city, then click_next() walks
    through all of its result pages.
    """
    # Read and parse the prepared city list.
    with open("a.txt", mode="r", encoding="utf-8") as file:
        jsondata = json.loads(file.read())

    city_input = ".city-div > input:nth-child(1)"
    for pro, cities in jsondata.items():
        for city in cities:
            # Keep only the Chinese city name after the "|" separator.
            place = str(city).split("|")[1]
            # ---- navigate to the city by rewriting the city input ----
            driver.find_element_by_css_selector(city_input).clear()
            driver.find_element_by_css_selector(city_input).send_keys(place)
            time.sleep(2)
            # Click the first suggestion (fixed screen offset) to jump there,
            # then move the virtual mouse back to the origin.
            ActionChains(driver).move_by_offset(226, 263).click().perform()
            ActionChains(driver).move_by_offset(-226, -263).perform()
            time.sleep(5)
            click_next(driver, pro, place)
def judgeLen(temp):
    """Return the first element of *temp*, or the string "null" when empty."""
    return temp[0] if len(temp) > 0 else "null"
def click_next(driver, pro, place):
    """Walk through every result page of the current city's hotel list.

    On each page, opens every hotel detail link (each opens a new tab),
    saves the parsed record via save(), closes the tab, and finally clicks
    the "next page" arrow. Stops when the arrow (class "arrowR") is gone.

    Args:
        driver: selenium webdriver currently showing the city's list page.
        pro: province name, forwarded to save().
        place: city name, forwarded to save().
    """
    n = 1  # page counter, used for progress logging only
    print('正在爬取', place, '酒店信息', '第%d页' % n)
    # If the city has a single page there is no arrowR element at all and
    # the while loop never runs.
    flag = 'arrowR"' in driver.page_source
    while flag:
        # BUG FIX: re-read the live page source every iteration. The
        # original tested a snapshot taken before the loop, so the exit
        # condition could never reflect the page reached after "next".
        if 'arrowR"' not in driver.page_source:
            break
        time.sleep(70)
        elements = driver.find_elements_by_css_selector('[class="detail-btn f-s"]')
        for element in elements:
            element.click()  # opens the hotel detail page in a new tab
            time.sleep(80)
            handles = driver.window_handles
            # Switch to the freshly opened detail tab.
            driver.switch_to.window(handles[1])
            currentPageUrl = driver.current_url
            driver.get(currentPageUrl)
            if '速度过快' in driver.title:
                # The site rate-limited us: back off for a long time.
                print('--now in sleep for reason 访问速度过快--')
                time.sleep(4000)
            else:
                save(cursor, driver.page_source, pro, place)
                time.sleep(5)
            driver.close()                   # close the detail tab
            driver.switch_to.window(handles[0])  # back to the list page
        time.sleep(90)
        # BUG FIX: locate the arrow only now, right before clicking it.
        # The original grabbed it before all the tab opening/closing,
        # risking a StaleElementReferenceException.
        next_elem = driver.find_element_by_class_name('arrowR')
        next_elem.click()
        n += 1
        print('while正在爬取', place, '酒店信息', '第%d页' % n)
    print(place, '爬取完成', '共%d页' % n)
# Script entry point: crawl hotel data for every city listed in a.txt.
if __name__ == '__main__':
    getData()
# Source blog metadata (extraction residue): views 4794, posted 2021-05-31 18:13:00