主页 M

python应用selenium爬取途牛全国的酒店数据并插入数据库,考虑分页

2021-05-31 网页编程网 网页编程网
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
from bs4 import BeautifulSoup
import re
from fake_useragent import UserAgent
import pymysql
# Module-level MySQL connection and cursor, shared by save() for inserts/commits.
# NOTE(review): credentials are hard-coded — move to config/env for production use.
connect = pymysql.connect(host="localhost", port=3306, user="root", passwd="123123",database="hotel",charset="utf8")
cursor = connect.cursor()


def save(cursor, page_source, pro, city):
    """Parse one hotel detail page and insert a row into the `ehotel` table.

    Args:
        cursor: active pymysql cursor; commits through the module-level `connect`.
        page_source: HTML text of the hotel detail page.
        pro: province name (key from a.txt).
        city: Chinese city name.
    """
    bs = BeautifulSoup(page_source, 'html.parser')

    name = ''
    node = bs.find('div', class_='hotel-name')
    if node is not None:
        name = node.get_text()

    hotelId = ''
    node = bs.find('span', class_='code')
    if node is not None:
        hotelId = node.get_text()

    address = ''
    node = bs.find('div', class_='address f-r')
    if node is not None:
        address = node.get_text()

    introduction = ''
    phone = ''
    # find_all always returns a list (possibly empty), so iterate directly;
    # the original `!= None` check was always true.
    infos = bs.find_all('p', class_='textCommon')
    for i, info in enumerate(infos):
        text = info.get_text()
        if i == len(infos) - 1:
            # Last paragraph carries the phone as "label:number"; guard
            # against pages where the colon is missing (was an IndexError).
            parts = text.split(':')
            phone = parts[1] if len(parts) > 1 else ''
        else:
            introduction += text

    if str(hotelId) != '':
        # Parameterized query: the original string-formatted SQL broke (and
        # was injectable) whenever a scraped field contained a quote.
        insertSql = """
                    insert into ehotel (pro,city,hotelId,name,address,introduction,phone) values
                    (%s,%s,%s,%s,%s,%s,%s)
                    """
        cursor.execute(insertSql, (str(pro), str(city), str(hotelId), str(name),
                                   str(address), str(introduction), str(phone)))
        connect.commit()
        print(name + 'insert successfully')
    else:
        print('------got empty data,can not insert')


# Set up the Chrome driver.
option = Options()
#option.add_argument('headless')
option.add_argument('--proxy-server=http://127.0.0.1:2745')  # local proxy; adjust/remove if none is running
option.add_argument('user-agent="{}"'.format(UserAgent().random))  # random UA to reduce blocking
driver = webdriver.Chrome(executable_path='chromedriver', options=option)
# Open the Tuniu hotel list page (initial city: Guangzhou).
driver.get("https://hotel.tuniu.com/list/602p0s0b0?cityName=%E5%B9%BF%E5%B7%9E")
# Maximize the window so the fixed click offsets below land on the dropdown.
driver.maximize_window()
# Sleep 3 seconds waiting for the page to load.
time.sleep(3)


def getData():
    """Iterate every province/city in a.txt and crawl its hotel listings.

    a.txt holds a JSON object mapping province -> list of "code|city"
    strings. For each city the search box on the already-open Tuniu page is
    filled in, the first suggestion is clicked (by screen offset), and
    click_next() then walks the paginated listing.

    Table creation is assumed done beforehand (see module comments).
    """
    with open("a.txt", mode="r", encoding="utf-8") as file:
        # json.load parses straight from the file object (no read+loads).
        jsondata = json.load(file)
    # pro is the province key; its value is the list of "code|city" entries.
    for pro, cities in jsondata.items():
        for city in cities:
            # The Chinese city name follows the "|" separator.
            place = str(city).split("|")[1]

            # ----------- switch the page to this city -----------
            # Look the input up once instead of twice.
            city_input = driver.find_element_by_css_selector(".city-div > input:nth-child(1)")
            city_input.clear()
            city_input.send_keys(place)
            time.sleep(2)
            # Click the first dropdown suggestion to jump to that city.
            ActionChains(driver).move_by_offset(226, 263).click().perform()
            # Move the mouse back to the origin so offsets stay absolute.
            ActionChains(driver).move_by_offset(-226, -263).perform()
            time.sleep(5)
            click_next(driver, pro, place)


def judgeLen(temp):
    """Return the first element of *temp*, or the string "null" if empty.

    Helper for safely pulling a single scraped value out of a
    possibly-empty result list.
    """
    # Truthiness check replaces the non-idiomatic `len(temp) > 0`.
    return temp[0] if temp else "null"

def click_next(driver, pro, place):
    """Crawl every listing page for one city, saving each hotel's details.

    Args:
        driver: shared selenium Chrome driver, already showing the city's
            first listing page.
        pro: province name.
        place: Chinese city name.
    """
    n = 1  # current page number (progress indicator)
    print('正在爬取', place, '酒店信息', '第%d页' % n)
    # 'arrowR"' appears in the HTML only while a "next page" arrow exists;
    # a city with a single page never enters the loop.
    # BUG FIX: the original read page_source once before the loop and kept
    # re-testing that stale copy, so the pagination check could never stop
    # the loop — it only ended when find_element raised on the last page.
    # Re-reading driver.page_source every pass makes termination correct.
    while 'arrowR"' in driver.page_source:
        next_elem = driver.find_element_by_class_name('arrowR')

        time.sleep(70)  # heavy throttling to dodge the anti-crawler ban
        elements = driver.find_elements_by_css_selector('[class="detail-btn f-s"]')

        for element in elements:
            # Each click opens the hotel detail page in a new tab.
            element.click()
            time.sleep(80)
            handles = driver.window_handles
            # Switch to the freshly opened detail tab.
            driver.switch_to.window(handles[1])
            currentPageUrl = driver.current_url
            driver.get(currentPageUrl)
            if '速度过快' in driver.title:
                # Rate-limited by the site: back off for a long time.
                print('--now in sleep for reason 访问速度过快--')
                time.sleep(4000)
            else:
                save(cursor, driver.page_source, pro, place)
            time.sleep(5)
            driver.close()  # close the detail tab
            driver.switch_to.window(handles[0])  # back to the main list page
        time.sleep(90)
        next_elem.click()
        n += 1
        print('while正在爬取', place, '酒店信息', '第%d页' % n)
    print(place, '爬取完成', '共%d页' % n)
if __name__ == '__main__':
    # Entry point: crawl every city listed in a.txt.
    getData()
阅读原文
阅读 3876
123 显示电脑版