"""Crawl the 2021 stats.gov.cn administrative-division listing.

Walks the province index page, each province's city page and each city's
county page, and appends one line per county to ``省市区.csv``.
"""

import time

import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.204 Safari/537.36',
    'Cookie': 'AD_RS_COOKIE=20080918; _trs_uv=kahvgie3_6_fc6v',
}
main_address = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/"

# Seconds to wait on the server before a request is abandoned; without a
# timeout a stalled connection would block the crawl forever.
REQUEST_TIMEOUT = 10
# Seconds to pause after each province / city has been processed.
REQUEST_DELAY = 1

# Legacy module-level state.  The crawl functions below no longer read these;
# they are kept only so code that used the old global-based calling style
# (``analysis_html(...)`` then ``trs``; ``make_url(url, pattern)`` with ``tr``
# set) keeps working.
trs = []
tr = None


def analysis_html(address, pattern):
    """Download *address* and return the nodes matching XPath *pattern*.

    The response body is decoded as UTF-8 and parsed with lxml's HTML parser.

    Args:
        address: URL of the page to fetch.
        pattern: XPath expression evaluated against the parsed page.

    Returns:
        The XPath result for *pattern*, or an empty list when the parser
        produced no document.

    Raises:
        requests.exceptions.RequestException: if the request fails or
            exceeds ``REQUEST_TIMEOUT``.
    """
    global trs
    response = requests.get(address, headers=headers, timeout=REQUEST_TIMEOUT)
    response.encoding = 'utf-8'
    html = etree.HTML(response.text)
    # Guard in case the parser hands back no document (e.g. an empty body).
    nodes = html.xpath(pattern) if html is not None else []
    trs = nodes  # legacy side effect; prefer the return value
    return nodes


def make_url(url_before, pattern, node=None):
    """Return *url_before* joined with the first XPath match on *node*.

    Args:
        url_before: URL prefix the matched value is appended to.
        pattern: XPath expression (relative to *node*) selecting the link.
        node: Element to evaluate *pattern* on.  When omitted, the
            module-level ``tr`` is used (legacy calling style).

    Returns:
        ``url_before`` concatenated with the first match of *pattern*.

    Raises:
        ValueError: if no node was passed and the legacy global is unset.
        IndexError: if *pattern* matches nothing on the node.
    """
    if node is None:
        node = tr
    if node is None:
        raise ValueError('make_url() needs a node to evaluate the pattern on')
    return url_before + node.xpath(pattern)[0]


def province():
    """Crawl every province on the index page, then its cities and counties.

    Prints each province name as it is reached and pauses ``REQUEST_DELAY``
    seconds after each province has been processed.
    """
    cells = analysis_html(f'{main_address}index.html', '//tr[@class="provincetr"]/td')
    # The last cell is deliberately left out.
    # NOTE(review): presumably it is an empty filler cell -- confirm against
    # the live page, otherwise the final province is being dropped.
    for cell in cells[:-1]:
        province_name = cell.xpath('./a/text()')[0]
        province_url = make_url(main_address, './a/@href', cell)
        print(province_name)
        city(province_url, province_name)
        time.sleep(REQUEST_DELAY)


def city(province_url, province):
    """Crawl every city listed on one province page.

    Args:
        province_url: URL of the province page to fetch.
        province: Province name, passed through to the output rows.

    Raises:
        IndexError: if a city row lacks the expected links.
    """
    for row in analysis_html(province_url, '//tr[@class="citytr"]'):
        city_name = row.xpath('./td[2]/a/text()')[0]
        city_id = row.xpath('./td[1]/a/text()')[0]
        # The link is passed the row explicitly, so it is resolved against
        # this city row rather than whatever node a global happened to hold.
        city_url = make_url(main_address, './td[1]/a/@href', row)
        country(city_url, city_name, city_id, province, province_url)
        time.sleep(REQUEST_DELAY)


def country(city_url, city, city_id, province, province_url):
    """Write one output line per linked county row of a city page.

    Lines are written to the module-level file object ``f`` opened by the
    script entry point, in the form
    ``county,county_id<TAB>,city,city_id<TAB>,province``.

    Args:
        city_url: URL of the city page to fetch.
        city: City name written to each line.
        city_id: City code written to each line.
        province: Province name written to each line.
        province_url: Unused; kept so existing callers keep working.
    """
    for row in analysis_html(city_url, '//tr[@class="countytr"]'):
        names = row.xpath('./td[2]/a/text()')
        codes = row.xpath('./td[1]/a/text()')
        if not names or not codes:
            # Rows without a link carry no name/code pair to record.  Only
            # this case is skipped: catching everything here would also hide
            # write errors and make Ctrl-C ineffective.
            continue
        # No pause per row: writing a line does not hit the server.
        f.write(f'{names[0]},{codes[0]}\t,{city},{city_id}\t,{province}\n')


if __name__ == "__main__":
    # ``f`` is intentionally module-level: country() writes through it.
    with open(r'省市区.csv', 'a', encoding='utf-8') as f:
        province()