"""Scrape the Meituan hotel listing page of one city and append it to a CSV file.

The page is rendered in a real Chrome session through Selenium, parsed with
lxml, and every hotel entry found is appended as one row to a CSV file.
"""

import csv
import time

from lxml import etree
from selenium import webdriver

# Column order of the rows written by save_datas().
FIELDNAMES = ['city', 'hotel', 'price', 'grade', 'consumer', 'address']

# NOTE(review): the original script hard-codes the price to 111 and has the
# real lookup ('.//div[@class="poi-price"]/em/text()') commented out.  The
# placeholder is kept so the output is unchanged; confirm whether the price
# selector can be re-enabled.
PLACEHOLDER_PRICE = 111


def get_driver():
    """Start and return a new Chrome WebDriver session."""
    return webdriver.Chrome()


def get_page_source(driver, url, wait_seconds=1):
    """Load *url* in *driver* and return the page's HTML source.

    Args:
        driver: WebDriver instance used to load the page.
        url: Address of the page to load.
        wait_seconds: Fixed pause after navigation, giving the page time to
            respond and render before the source is read.  Defaults to the
            1 second the script has always used.

    Returns:
        The value of ``driver.page_source`` after the pause.
    """
    driver.get(url)
    time.sleep(wait_seconds)
    return driver.page_source


def get_xhtml(page_source):
    """Parse an HTML string into an lxml element tree that supports XPath."""
    return etree.HTML(page_source)


def _first_text(node, path, default=None):
    """Return the stripped first result of the XPath *path* under *node*.

    Args:
        node: Object exposing an ``xpath(path)`` method returning a list.
        path: XPath expression selecting text nodes.
        default: Value returned when nothing matches.  When left as ``None``
            the field is treated as required.

    Raises:
        IndexError: If nothing matches and no *default* was supplied.  This is
            the same exception type the bare ``[0]`` lookup raised, with the
            failing XPath added to the message.
    """
    matches = node.xpath(path)
    if matches:
        return matches[0].strip()
    if default is None:
        raise IndexError('no match for XPath {!r}'.format(path))
    return default


def parse_datas(xhtml, city):
    """Yield one record per hotel entry found in *xhtml*.

    This is a generator.  Every record is a *new* dict, so callers may safely
    collect the results (e.g. ``list(parse_datas(...))``); re-yielding a single
    shared dict would leave every collected element showing the last hotel.

    Args:
        xhtml: Parsed page as returned by get_xhtml().
        city: City name stored in each record's ``'city'`` field.

    Yields:
        dict with the keys listed in FIELDNAMES.

    Raises:
        IndexError: If an entry lacks a title, grade or address element.
    """
    for item in xhtml.xpath('//article[@class="poi-item"]'):
        yield {
            'city': city,
            'hotel': _first_text(item, './/a[@class="poi-title"]/text()'),
            'price': PLACEHOLDER_PRICE,
            'grade': _first_text(item, './/div[@class="poi-grade"]/text()'),
            # Some hotels show no consumer count; store an empty string then.
            'consumer': _first_text(
                item, './/div[@class="poi-buy-num"]/text()', default=''),
            'address': _first_text(item, './/div[@class="poi-address"]/text()'),
        }


def save_datas(datas, path='hotel.csv'):
    """Append the records in *datas* to a CSV file.

    The file is opened in append mode and no header row is written, so
    repeated runs keep adding rows to the same file.

    Args:
        datas: Iterable of dicts keyed by FIELDNAMES (a generator is fine).
        path: Destination file; defaults to ``hotel.csv`` in the working
            directory.
    """
    # GB18030 is the encoding the script has always used; presumably chosen so
    # the Chinese text opens correctly in Excel -- confirm before changing.
    with open(path, 'a', encoding='GB18030', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=FIELDNAMES)
        writer.writerows(datas)


def main():
    """Scrape the Guangzhou hotel listing page and append it to hotel.csv."""
    city = '广州'
    url = 'https://hotel.meituan.com/guangzhou/'
    driver = get_driver()
    try:
        page_source = get_page_source(driver, url)
        xhtml = get_xhtml(page_source)
        save_datas(parse_datas(xhtml, city))
    finally:
        # Always close the browser, even when parsing or saving fails.
        driver.quit()


if __name__ == '__main__':
    main()
使用 Selenium 爬取美团全国酒店信息并保存到 Excel
阅读:2989 输入:2020-10-08 14:57:44