"""Collect the city links shown on a Meituan hotel page using Selenium and lxml."""

import time

from lxml import etree
from selenium import webdriver

# XPath matching every city anchor inside the "classify" panel of the page.
_CITY_LINK_XPATH = (
    '//div[@class="classify-content"]/div'
    '/div[@class="classify-row"]/div/a'
)


def get_driver():
    """Create and return a new Chrome WebDriver.

    The caller owns the returned driver and should call ``quit()`` on it
    once it is no longer needed.
    """
    return webdriver.Chrome()


def get_page_source(driver, url, wait_seconds=1):
    """Navigate *driver* to *url* and return the page source.

    Args:
        driver: WebDriver used to load the page.
        url: Address to load.
        wait_seconds: Seconds to sleep after navigation before the source
            is read. Defaults to 1.

    Returns:
        The value of ``driver.page_source`` after the pause.
    """
    driver.get(url)
    # Fixed pause to give the page time to respond and render.
    time.sleep(wait_seconds)
    return driver.page_source


def get_xhtml(page_source):
    """Parse *page_source* with ``etree.HTML`` and return the result."""
    return etree.HTML(page_source)


def parse_city(xhtml):
    """Extract the unique ``(city_name, city_url)`` pairs from a parsed page.

    The name and the URL are read from the same ``<a>`` element, so an
    anchor that lacks text or an ``href`` cannot shift the pairing of the
    anchors that follow it.

    Args:
        xhtml: Parsed document exposing an ``xpath()`` method, as returned
            by ``get_xhtml``.

    Returns:
        A tuple of ``(name, url)`` tuples with duplicates removed, in the
        order the pairs first appear in the document.
    """
    pairs = []
    for anchor in xhtml.xpath(_CITY_LINK_XPATH):
        # Join the anchor's direct text nodes so one anchor yields one name.
        name = "".join(anchor.xpath("text()"))
        href = anchor.get("href")
        if name and href is not None:
            pairs.append((name, href))
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return tuple(dict.fromkeys(pairs))


if __name__ == '__main__':
    url = 'https://hotel.meituan.com/guangzhou/'
    driver = get_driver()
    try:
        # Load the page and print the city links found on it.
        page_source = get_page_source(driver, url)
        xhtml = get_xhtml(page_source)
        print(parse_city(xhtml))
    finally:
        # Always shut the browser down, even if loading or parsing fails.
        driver.quit()
Selenium 爬取美团全国酒店信息:采集全国酒店信息并去重
阅读:3378 输入:2020-10-08 15:29:07