"""Scrape hotel listings for one city from hotel.meituan.com with Selenium.

Each hotel card is parsed with lxml XPath and appended as one row to
``hotel.csv`` (GB18030-encoded). Pagination is followed by clicking the
"next" button until it is disabled or a 51-page safety cap is reached.
"""
from selenium import webdriver
import time
from lxml import etree
import csv

# Column order for the CSV output; shared by the writer and the parser.
FIELDNAMES = ['city', 'hotel', 'price', 'grade', 'consumer', 'address']


def get_driver():
    """Start and return a Chrome WebDriver instance."""
    driver = webdriver.Chrome()
    return driver


def get_page_source(driver, url):
    """Load *url* in *driver* and return the rendered page source.

    Sleeps one second to give the page time to respond and render.
    """
    driver.get(url)
    time.sleep(1)
    return driver.page_source


def get_xhtml(page_source):
    """Parse an HTML string into an lxml element tree."""
    return etree.HTML(page_source)


def _first_text(item, xpath, default=''):
    """Return the stripped text of the first node matching *xpath* under
    *item*, or *default* when no node matches (some cards omit fields)."""
    try:
        return item.xpath(xpath)[0].strip()
    except IndexError:
        return default


def parse_datas(xhtml, city):
    """Generator: yield one dict per hotel card found in *xhtml*.

    A fresh dict is yielded for every hotel. (The original reused a single
    dict across iterations, so materializing the generator — e.g. with
    ``list()`` — produced N copies of the last row.)
    """
    for item in xhtml.xpath('//article[@class="poi-item"]'):
        yield {
            'city': city,
            'hotel': _first_text(item, './/a[@class="poi-title"]/text()'),
            # Was hard-coded to the debug placeholder 111 with the XPath
            # commented out; restored, with '' for hotels lacking a price.
            'price': _first_text(item, './/div[@class="poi-price"]/em/text()'),
            'grade': _first_text(item, './/div[@class="poi-grade"]/text()'),
            # Some hotels have no consumer-count info; fall back to ''.
            'consumer': _first_text(item, './/div[@class="poi-buy-num"]/text()'),
            'address': _first_text(item, './/div[@class="poi-address"]/text()'),
        }


def save_datas(datas):
    """Append the rows in *datas* to hotel.csv.

    Writes the header row first when the file is new/empty, so the output
    is self-describing instead of headerless.
    """
    with open('hotel.csv', 'a', encoding='GB18030', newline='') as c:
        writer = csv.DictWriter(c, fieldnames=FIELDNAMES)
        if c.tell() == 0:  # new or empty file: emit the header exactly once
            writer.writeheader()
        writer.writerows(datas)


def click_next(driver, page_source, city):
    """Follow the "next page" pagination for *city*, scraping and saving
    each page, then quit the driver.

    Stops when the "next" button is disabled in the current page source,
    or after a 51-page safety cap.
    """
    n = 1  # current page number, for progress reporting
    print('正在爬取', city, '酒店信息', '第%d页' % n)
    # If the city has only one page, the disabled "next" marker is already
    # present and the loop below never runs.
    flag = 'disabled next' not in page_source
    while flag:
        # NOTE(review): find_element_by_xpath is the Selenium 3 API (removed
        # in Selenium 4); kept as-is to match what this file targets.
        next_elem = driver.find_element_by_xpath('//li[@class=" next"]/a')
        next_elem.click()
        time.sleep(1)  # wait for the next page to load
        page_source = driver.page_source
        xhtml = get_xhtml(page_source)
        save_datas(parse_datas(xhtml, city))
        n += 1
        print('正在爬取', city, '酒店信息', '第%d页' % n)
        # Stop when no further page exists, or at the 51-page cap.
        if 'disabled next' in page_source or n >= 51:
            break
    print(city, '爬取完成', '共%d页' % n)
    driver.quit()


if __name__ == '__main__':
    city = '广州'
    url = 'https://hotel.meituan.com/guangzhou/'
    # Open the browser and scrape the first page for the city.
    driver = get_driver()
    page_source = get_page_source(driver, url)
    xhtml = get_xhtml(page_source)
    save_datas(parse_datas(xhtml, city))
    # Follow pagination for the remaining pages.
    click_next(driver, page_source, city)