主页 M

selenium爬取美团全国酒店信息:采集全国酒店信息并除重

2020-10-08 网页编程网 网页编程网
from selenium import webdriver
import time
from lxml import etree

def get_driver():
	"""Create a new Chrome WebDriver instance and return it."""
	return webdriver.Chrome()

def get_page_source(driver, url, wait_seconds=1):
	"""Load ``url`` in ``driver`` and return the resulting page source.

	Args:
		driver: Object exposing ``get(url)`` and a ``page_source`` attribute
			(the script passes a Selenium Chrome driver).
		url: Address to navigate to.
		wait_seconds: How long to pause after navigation before reading the
			source. Defaults to 1, the previously hard-coded value; a value
			of 0 or less skips the pause.

	Returns:
		The value of ``driver.page_source`` read after the pause.
	"""
	driver.get(url)
	# Fixed pause to give the page time to respond and render before the
	# source is read; this is not a readiness check.
	if wait_seconds > 0:
		time.sleep(wait_seconds)
	return driver.page_source

def get_xhtml(page_source):
	"""Parse ``page_source`` with ``etree.HTML`` and return the result."""
	return etree.HTML(page_source)

def parse_city(xhtml):
	"""Collect the unique (city name, city URL) pairs from a parsed page.

	The name and the href are read from the same ``<a>`` element, so a link
	with no text or no ``href`` cannot shift the pairing of the links that
	follow it.

	Args:
		xhtml: Parsed document exposing ``xpath(query)``; the matched anchor
			elements must expose ``xpath('text()')`` and ``get('href')``.

	Returns:
		A tuple of ``(name, url)`` tuples with duplicates removed, in the
		order the links first appear in the document.
	"""
	anchors = xhtml.xpath('//div[@class="classify-content"]/div/div[@class="classify-row"]/div/a')
	pairs = []
	for anchor in anchors:
		href = anchor.get('href')
		# Join the anchor's direct text nodes so one link yields one name.
		name = ''.join(anchor.xpath('text()'))
		# Links missing either part cannot form a usable pair.
		if href is None or not name:
			continue
		pairs.append((name, href))
	# dict.fromkeys removes duplicates while keeping first-seen order, so the
	# result is deterministic across runs.
	return tuple(dict.fromkeys(pairs))

if __name__ == '__main__':
	url = 'https://hotel.meituan.com/guangzhou/'
	driver = get_driver()
	try:
		# Load the hotel landing page and print the de-duplicated
		# (city name, city URL) pairs found on it.
		page_source = get_page_source(driver, url)
		xhtml = get_xhtml(page_source)
		print(parse_city(xhtml))
	finally:
		# Always shut the browser down, even if loading or parsing fails,
		# so no Chrome process is left running.
		driver.quit()
阅读原文
阅读 3382
123 显示电脑版