import csv
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
def get_driver():
    """Launch and return a new Chrome WebDriver instance."""
    return webdriver.Chrome()
def get_page_source(driver, url):
    """Navigate *driver* to *url* and return the rendered page HTML."""
    driver.get(url)
    # Sleep one second so the page has time to respond and render.
    time.sleep(1)
    return driver.page_source
def get_xhtml(page_source):
    """Parse an HTML string and return the lxml element-tree root."""
    return etree.HTML(page_source)
# Generator: yields one record per hotel found on the page.
def parse_datas(xhtml, city):
    """Yield a dict of hotel info for each ``article.poi-item`` node.

    Args:
        xhtml: lxml element tree of a listing page (from get_xhtml).
        city: city name stored verbatim in every yielded record.

    Yields:
        dict with keys city / hotel / price / grade / consumer / address.
    """
    items = xhtml.xpath('//article[@class="poi-item"]')
    for item in items:
        # BUG FIX: the original created ONE dict before the loop and yielded
        # it repeatedly, so a consumer that materialized the generator
        # (e.g. list(...)) got N references to the same dict containing only
        # the last hotel. A fresh dict per item fixes that.
        datas = {
            'city': city,
            'hotel': item.xpath('.//a[@class="poi-title"]/text()')[0].strip(),
            # Price scraping was deliberately disabled upstream (the xpath
            # was commented out); keep the placeholder to preserve behavior.
            'price': 111,
            'grade': item.xpath('.//div[@class="poi-grade"]/text()')[0].strip(),
            'address': item.xpath('.//div[@class="poi-address"]/text()')[0].strip(),
        }
        # Some hotels have no consumer-count node; store '' in that case.
        try:
            datas['consumer'] = item.xpath('.//div[@class="poi-buy-num"]/text()')[0].strip()
        except IndexError:
            datas['consumer'] = ''
        yield datas
def save_datas(datas):
    """Append the records in *datas* to hotel.csv (GB18030, no header).

    Append mode lets successive pages/cities accumulate in a single file.
    """
    fieldnames = ['city', 'hotel', 'price', 'grade', 'consumer', 'address']
    with open('hotel.csv', 'a', encoding='GB18030', newline='') as fh:
        csv.DictWriter(fh, fieldnames=fieldnames).writerows(datas)
def click_next(driver, page_source, city, max_pages=51):
    """Page through *city*'s hotel listings, scraping and saving each page.

    Page 1 was already loaded (and saved) by the caller; *page_source* is
    its HTML and is used to detect a single-page city. Each click loads the
    next page, which is parsed and appended to hotel.csv. Quits the driver
    when finished.

    Args:
        driver: live Selenium WebDriver currently showing page 1.
        page_source: HTML of page 1.
        city: city name, used for progress messages and the saved rows.
        max_pages: safety cap on pages visited (default 51, the original
            hard-coded limit).
    """
    n = 1  # pages visited so far (progress counter)
    print('正在爬取', city, '酒店信息', '第%d页' % n)
    # If the "next" button is already disabled (single-page city, or the
    # last page reached), stop paging.
    while 'disabled next' not in page_source:
        # BUG FIX: find_element_by_xpath was removed in Selenium 4; use the
        # find_element(By.XPATH, ...) form instead.
        driver.find_element(By.XPATH, '//li[@class=" next"]/a').click()
        # Wait one second for the next page to load and render.
        time.sleep(1)
        page_source = driver.page_source
        save_datas(parse_datas(get_xhtml(page_source), city))
        n += 1
        print('正在爬取', city, '酒店信息', '第%d页' % n)
        # Safety cap so a broken "disabled" check cannot loop forever.
        if n >= max_pages:
            break
    print(city, '爬取完成', '共%d页' % n)
    driver.quit()
if __name__ == '__main__':
    city = '广州'
    url = 'https://hotel.meituan.com/guangzhou/'
    # Launch the browser.
    driver = get_driver()
    # Scrape and save the first page of hotels for this city.
    page_source = get_page_source(driver, url)
    save_datas(parse_datas(get_xhtml(page_source), city))
    # Walk through the remaining pages; quits the driver when finished.
    click_next(driver, page_source, city)