主页 M

使用 Python + Selenium 爬取途牛全国酒店数据并插入数据库（未考虑分页）

2021-05-31 网页编程网 网页编程网

AllCity.txt

{
    "北京": ["bj|北京"],

    "天津": ["tj|天津"],

    "上海": ["sh|上海"],

    "台湾": ["tw|台湾"],

    "香港": ["hk|香港"],

    "澳门": ["am|澳门"],

    "河北": ["bd|保定", "cangzhou|沧州", "chengde|承德", "dingzhou|定州", "gt|馆陶", "hd|邯郸", "hs|衡水", "lf|廊坊", "qhd|秦皇岛", "sjz|石家庄", "ts|唐山", "xt|邢台", "zjk|张家口", "zd|正定", "zx|赵县", "zhangbei|张北"],

    "河南": ["ay|安阳", "changge|长葛", "hb|鹤壁", "jiaozuo|焦作", "jiyuan|济源", "kaifeng|开封", "luoyang|洛阳", "luohe|漯河", "mg|明港", "ny|南阳", "pds|平顶山", "puyang|濮阳", "sq|商丘", "smx|三门峡", "xx|新乡", "xc|许昌", "xy|信阳", "yuzhou|禹州", "yanling|鄢陵", "zz|郑州", "zk|周口", "zmd|驻马店"],

    "黑龙江": ["dq|大庆","dxal|大兴安岭", "hrb|哈尔滨", "hegang|鹤岗", "heihe|黑河", "jms|佳木斯", "jixi|鸡西", "mdj|牡丹江", "qqhr|齐齐哈尔", "qth|七台河", "suihua|绥化", "sys|双鸭山", "yich|伊春"],

    "吉林": ["bc|白城", "baishan|白山", "cc|长春", "jl|吉林", "liaoyuan|辽源", "songyuan|松原", "sp|四平", "th|通化", "yanbian|延边"],

    "辽宁" : ["as|鞍山", "benxi|本溪", "cy|朝阳", "dl|大连", "dandong|丹东", "fushun|抚顺", "fx|阜新", "hld|葫芦岛", "jinzhou|锦州", "liaoyang|辽阳", "pj|盘锦", "sy|沈阳", "tl|铁岭", "wfd|瓦房店", "yk|营口", "pld|庄河"],

    "山东": ["bz|滨州", "dz|德州", "dy|东营", "heze|菏泽", "jn|济南", "jining|济宁", "kl|垦利", "linyi|临沂", "lc|聊城", "lw|莱芜", "qd|青岛", "rizhao|日照", "shouguang|寿光", "longkou|龙口", "ta|泰安", "wf|潍坊", "weihai|威海", "yt|烟台", "zb|淄博", "zaozhuang|枣庄", "zhangqiu|章丘", "zc|诸城"],

    "内蒙古": ["alsm|阿拉善盟", "bt|包头", "bycem|巴彦淖尔", "chifeng|赤峰", "erds|鄂尔多斯", "hu|呼和浩特", "hlbe|呼伦贝尔", "hlr|海拉尔", "tongliao|通辽", "wuhai|乌海", "wlcb|乌兰察布", "xl|锡林郭勒", "xam|兴安盟"],

    "江苏": ["cz|常州", "dafeng|大丰", "danyang|丹阳", "dongtai|东台", "donghai|东海", "ha|淮安", "haimen|海门", "haian|海安", "jingjiang|靖江", "jianhu|建湖", "liyang|溧阳", "lyg|连云港", "nj|南京", "nt|南通", "pizhou|邳州", "qidong|启东", "rugao|如皋", "rudong|如东", "su|苏州", "shuyang|沭阳", "suqian|宿迁", "taizhou|泰州", "taixing|泰兴", "wx|无锡", "xinghuashi|兴化", "xinyishi|新沂", "xz|徐州", "xzpeixian|沛县", "yangzhong|扬中", "yz|扬州", "yancheng|盐城", "zj|镇江"],

    "安徽": ["anqing|安庆", "bengbu|蚌埠", "bozhou|亳州", "ch|巢湖", "chizhou|池州", "chuzhou|滁州", "fy|阜阳", "hf|合肥", "hn|淮南", "huaibei|淮北", "huangshan|黄山", "hexian|和县", "hq|霍邱", "la|六安", "mas|马鞍山", "ningguo|宁国", "suzhou|宿州", "tianchang|天长", "tongling|铜陵", "tongcheng|桐城", "wuhu|芜湖", "xuancheng|宣城"],

    "山西": ["changzhi|长治", "dt|大同", "jincheng|晋城", "jz|晋中", "lvliang|吕梁", "linfen|临汾", "linyixian|临猗", "qingxu|清徐", "shuozhou|朔州", "ty|太原", "xinzhou|忻州", "yuncheng|运城", "yq|阳泉"],

    "陕西": ["ankang|安康", "baoji|宝鸡", "hanzhong|汉中", "sl|商洛", "tc|铜川", "wn|渭南", "xa|西安", "xianyang|咸阳", "yanan|延安", "yl|榆林"],

    "甘肃": ["by|白银", "dx|定西", "gn|甘南", "jinchang|金昌", "jyg|嘉峪关", "jq|酒泉", "lz|兰州", "linxia|临夏", "ln|陇南", "pl|平凉", "qingyang|庆阳", "tianshui|天水", "wuwei|武威", "zhangye|张掖"],

    "浙江": ["hz|杭州", "cixi|慈溪", "changxing|长兴", "deqing|德清", "dongyang|东阳", "haining|海宁", "huzhou|湖州", "jiashanx|嘉善", "jx|嘉兴", "jh|金华", "lishui|丽水", "nb|宁波", "quzhou|衢州", "ruiancity|瑞安", "sx|绍兴", "tongxiang|桐乡", "tz|台州", "wenling|温岭", "wz|温州", "xiangshanxian|象山", "yiwu|义乌", "yueqingcity|乐清", "yuyao|余姚", "zhoushan|舟山", "zhuji|诸暨"],

    "江西": ["fuzhou|抚州", "ganzhou|赣州", "jj|九江", "ja|吉安", "jdz|景德镇", "nc|南昌", "px|萍乡", "sr|上饶", "xinyu|新余", "yingtan|鹰潭", "yichun|宜春", "yxx|永新"],

    "湖北": ["es|恩施", "ez|鄂州", "hshi|黄石", "hg|黄冈", "jingzhou|荆州", "jingmen|荆门", "qianjiang|潜江", "shiyan|十堰", "snj|神农架", "suizhou|随州", "tm|天门", "wh|武汉", "xf|襄阳", "xiaogan|孝感", "xiantao|仙桃", "xianning|咸宁", "yc|宜昌", "yidou|宜都"],

    "湖南": ["cs|长沙", "changde|常德", "chenzhou|郴州", "hy|衡阳", "hh|怀化", "ld|娄底", "shaoyang|邵阳", "xiangtan|湘潭", "xiangxi|湘西", "yy|岳阳", "yongzhou|永州", "yiyang|益阳", "zhuzhou|株洲", "zjj|张家界"],

    "贵州": ["anshun|安顺", "bijie|毕节", "gy|贵阳", "lps|六盘水", "qdn|黔东南", "qn|黔南", "qxn|黔西南", "tr|铜仁", "zunyi|遵义"],

    "四川": ["ab|阿坝", "bazhong|巴中", "cd|成都", "deyang|德阳", "dazhou|达州", "ga|广安", "guangyuan|广元", "ganzi|甘孜", "ls|乐山", "luzhou|泸州", "liangshan|凉山", "mianyang|绵阳", "ms|眉山", "scnj|内江", "nanchong|南充", "panzhihua|攀枝花", "suining|遂宁", "yb|宜宾", "ya|雅安", "zg|自贡", "zy|资阳"],

    "云南": ["bs|保山", "cx|楚雄", "dali|大理", "diqing|迪庆", "dh|德宏", "honghe|红河", "km|昆明", "lj|丽江", "lincang|临沧", "nujiang|怒江", "pe|普洱", "qj|曲靖", "ws|文山", "bn|西双版纳", "yx|玉溪", "zt|昭通"],

    "新疆":  ["aks|阿克苏", "ale|阿拉尔", "bygl|巴音郭楞", "betl|博尔塔拉", "changji|昌吉", "hami|哈密", "ht|和田", "klmy|克拉玛依", "kel|库尔勒", "ks|喀什", "kzls|克孜勒苏", "shz|石河子", "tlf|吐鲁番", "tmsk|图木舒克", "xj|乌鲁木齐", "wjq|五家渠", "yili|伊犁", "alt|阿勒泰", "tac|塔城"],

    "宁夏": ["guyuan|固原", "szs|石嘴山", "wuzhong|吴忠", "yinchuan|银川", "zw|中卫"],
    "青海":  ["guoluo|果洛", "huangnan|黄南", "hx|海西", "haidong|海东", "haibei|海北", "hainan|海南", "xn|西宁", "ys|玉树"],
    "西藏": ["al|阿里", "changdu|昌都", "lasa|拉萨", "linzhi|林芝", "nq|那曲", "rkz|日喀则", "sn|山南", "rituxian|日土", "gaizexian|改则"],

    "广西": ["baise|百色", "bh|北海", "chongzuo|崇左", "fcg|防城港", "gl|桂林", "gg|贵港", "hc|河池", "hezhou|贺州", "liuzhou|柳州", "lb|来宾", "nn|南宁", "qinzhou|钦州", "wuzhou|梧州", "yulin|玉林"],

    "广东": ["chaozhou|潮州", "dg|东莞", "fs|佛山", "gz|广州", "huidong|惠东", "huizhou|惠州", "heyuan|河源", "jm|江门", "jy|揭阳", "mm|茂名", "mz|梅州", "qingyuan|清远", "sd|顺德", "sz|深圳", "st|汕头", "sg|韶关", "sw|汕尾", "taishan|台山", "yj|阳江", "yangchun|阳春", "yf|云浮", "zh|珠海", "zs|中山", "zhanjiang|湛江", "zq|肇庆", "boluo|博罗"],

    "福建": ["fz|福州", "jinjiangshi|晋江", "ly|龙岩", "nd|宁德", "np|南平", "nananshi|南安", "pt|莆田", "qz|泉州", "sm|三明", "shishi|石狮", "wuyishan|武夷山", "xm|厦门", "zhangzhou|漳州"],

    "海南": ["haikou|海口", "sansha|三沙", "sanya|三亚", "wzs|五指山", "qh|琼海", "wenchang|文昌", "wanning|万宁", "tunchang|屯昌", "qiongzhong|琼中", "lingshui|陵水", "df|东方", "da|定安", "cm|澄迈", "baoting|保亭", "baish|白沙", "tanzhou|儋州"]
}

hotel.py

import json
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import time
from bs4 import BeautifulSoup
import re
import pymysql

# Path of the ChromeDriver executable, the listing page used as the entry
# point (cityName is the URL-encoded form of "广州"), and the fixed wait time.
_CHROMEDRIVER_PATH = "chromedriver.exe"
_START_URL = "https://hotel.tuniu.com/list/602p0s0b0?cityName=%E5%B9%BF%E5%B7%9E"
_PAGE_LOAD_WAIT_SECONDS = 3

# Start Chrome through Selenium; `driver` is the module-level browser handle.
driver = webdriver.Chrome(_CHROMEDRIVER_PATH)
# Open the Tuniu hotel list page.
driver.get(_START_URL)
# Maximize the window (presumably so fixed pixel offsets used for clicking
# line up with the page layout -- confirm against the code that clicks).
driver.maximize_window()
# Fixed sleep to give the page time to finish loading.
time.sleep(_PAGE_LOAD_WAIT_SECONDS)

# 判断一下数据为不为空 为空就将字符串"null"返回去
def judgeLen(temp):
    """Return the first element of ``temp``, or the string ``"null"`` if it is empty.

    Args:
        temp: A sized, indexable sequence (for example the list returned by
            ``re.findall``).

    Returns:
        ``temp[0]`` when ``temp`` has at least one element, otherwise the
        literal string ``"null"``.
    """
    # Guard clause: nothing matched, so hand back the placeholder value.
    if len(temp) == 0:
        return "null"
    return temp[0]

def getData():
    """Scrape the hotel list of every city in AllCity.txt and insert each hotel into MySQL.

    Steps, as written below:
      1. Open a pymysql connection and cursor.
      2. Load ``AllCity.txt`` as JSON: a dict mapping a province name to a
         list of ``"code|city name"`` strings.
      3. For each city, type the city name into the page's city input using
         the module-level ``driver``, click at a fixed pixel offset to pick
         the first suggestion, then wait for the page to change.
      4. Parse ``driver.page_source`` with BeautifulSoup and, for every
         ``div.hotel-item``, extract name, brand, star level, rating,
         comment count and price with regular expressions.
      5. Insert one row per hotel into ``TC_hotel`` and commit after each row.

    Only the hotels present in the page source after the jump are read; no
    pagination is followed.

    NOTE(review): the code between the ``namepatt`` line and the first
    ``hname`` assignment appears to have been damaged (likely by HTML
    stripping when the script was published): the regex patterns are
    truncated, ``commpatt`` and ``name`` are used but never defined here,
    and the ``pricepatt`` line is not valid Python. Restore these lines from
    the original script before running.

    NOTE(review): the INSERT statement is built with ``str.format`` from
    scraped text, so a quote character in any field breaks the statement;
    consider passing the values as parameters to ``cursor.execute``.

    NOTE(review): the connection and cursor are never closed in this function.
    """
    # Connect to the database
    connect = pymysql.connect(host="xxxxx", port=12345, user="xxx", passwd="xxxx",database="mydata",charset="utf8")
    # Get a cursor
    cursor = connect.cursor()

    # The target table must already exist (create it beforehand in a GUI tool, or write the DDL yourself)
    
    # Open the prepared text file that lists every city name
    with open("AllCity.txt",mode="r",encoding="utf-8") as file:
        # Read the whole file
        text = file.read()
        # Parse the text as JSON
        jsondata = json.loads(text)
        # Iterate over the parsed dict; pro is the key, i.e. the province
        for pro in jsondata:
            tempList = jsondata[pro]
            # Iterate over the province's values; each entry is one city
            for city in tempList:
                # Split on "|" and keep the second part, the Chinese city name
                place = (str(city).split("|")[1])

                # ----------------- Change the city in the search box to jump to it -----------------
                # Clear the current content of the city input
                driver.find_element_by_css_selector(".city-div > input:nth-child(1)").clear()
                # Type the Chinese city name into the input
                driver.find_element_by_css_selector(".city-div > input:nth-child(1)").send_keys(place)
                time.sleep(2)
                # Click the first entry of the suggestion box (located by a fixed pixel offset) to jump to that city
                ActionChains(driver).move_by_offset(226, 263).click().perform()
                # Move the pointer back by the same offset so the next city's relative move starts from the origin
                ActionChains(driver).move_by_offset(-226, -263).perform()
                time.sleep(5)

                # Parse the page content returned by the driver
                bs = BeautifulSoup(driver.page_source, "html.parser")
                # Collect the div element of every hotel
                data = bs.find_all("div", class_="hotel-item")
                # Iterate over the hotel divs
                for div in data:
                    # Extract each field with a regular expression
                    # NOTE(review): the patterns from here down to the first `hname`
                    # assignment look truncated/garbled in this copy -- see the docstring.
                    # Hotel name
                    namepatt = re.compile(r'span.*?hotel-name f-m.*?>(.*?)')
                    # Diamond icons, treated the same as stars
                    diapatt = re.compile(r'(icon icon-diamond)')
                    # Star icons
                    starpatt = re.compile(r'(icon icon-star)')
                    # Rating
                    ratingpatt = re.compile(
                        r'"hotel-score f-b f-DINA" data-v-74d0f10f="" style="background: rgb.*?;">(.*?)(.*?)条评论')
                    # Price
                    pricepatt = re.compile(
                        r'(.*?) 0:
                        # Hotel name: the text after "(" with its last character dropped
                        hname = name.split("(")[1][:-1]
                        # Brand: the text before "("
                        hbrand = name.split("(")[0]
                    else:
                        # Hotel name: the whole string
                        hname = name
                        # Brand: falls back to the literal "其他"
                        hbrand = "其他"
                    # Star level is the number of icon matches: diamonds if any are present, otherwise stars
                    if len(re.findall(diapatt, str(div))) > 0:
                        star = str(len(re.findall(diapatt, str(div)))) + "星"
                    else:
                        star = str(len(re.findall(starpatt, str(div)))) + "星"
                    # Rating ("null" when nothing matched)
                    rating = judgeLen(re.findall(ratingpatt, str(div)))
                    # Comment count ("null" when nothing matched)
                    comm = judgeLen(re.findall(commpatt, str(div)))
                    # Price ("null" when nothing matched)
                    price = judgeLen(re.findall(pricepatt, str(div)))
                    # Build the INSERT statement for this hotel
                    # NOTE(review): values are interpolated directly into the SQL text
                    insertSql = """
                                insert into `TC_hotel` (hname,hbrand,province,city,starlevel,rating,comment_count,price)values
                                ('{}','{}','{}','{}','{}','{}','{}','{}')
                                """.format(str(hname), str(hbrand), str(pro), str(place), str(star), str(rating), str(comm), str(price))
                    # Execute the INSERT statement
                    cursor.execute(insertSql)
                    # Commit after every row
                    connect.commit()
                    # Print what was inserted
                    print("插入数据   "+str(pro), str(place), str(hname), str(hbrand), str(star), str(rating), str(comm), str(price))
# Script entry point: run the scraper only when this file is executed directly.
if __name__ == "__main__":
    getData()
阅读原文
阅读 4250
123 显示电脑版