import requests
import time
from lxml import etree
 
 
def analysis_html(address, pattern, timeout=10):
    """Download a page and evaluate an XPath expression against it.

    The page is requested with the module-level ``headers``, decoded as
    UTF-8, parsed as HTML, and queried with ``pattern``.  The matches are
    stored in the module-level ``trs`` (which the other functions in this
    file read) and are also returned.

    Args:
        address: URL of the page to fetch.
        pattern: XPath expression evaluated on the parsed document.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error; without one a stalled connection would block
            the crawl indefinitely.

    Returns:
        The list of XPath matches (empty when the document has no tree).
    """
    global trs
    response = requests.get(address, headers=headers, timeout=timeout)
    # Force UTF-8 instead of relying on the encoding guessed from headers.
    response.encoding = 'utf-8'
    html = etree.HTML(response.text)
    # The parser can yield None for a body with no elements; treat that as
    # "no matches" rather than failing with AttributeError on None.
    trs = html.xpath(pattern) if html is not None else []
    return trs
 
 
def make_url(url_before, pattern, node=None):
    """Build a URL by appending the first XPath match to a prefix.

    Args:
        url_before: URL prefix the matched value is appended to.
        pattern: XPath expression evaluated on ``node``; its first result
            is used.
        node: Element to query.  When omitted, the module-level ``tr`` is
            used, which is how callers that publish their current row via
            ``global tr`` rely on this function.

    Returns:
        ``url_before`` concatenated with the first match.

    Raises:
        IndexError: If ``pattern`` matches nothing on the node.
    """
    if node is None:
        # Backward-compatible fallback: read the row shared via `global tr`.
        node = tr
    return url_before + node.xpath(pattern)[0]
 
 
def province():
    """Crawl the index page and descend into each province it lists.

    Loads ``index.html`` under ``main_address``, collects the ``td`` cells
    of the ``provincetr`` rows, and for every cell except the last one
    prints the province name and passes the province page on to ``city``,
    waiting one second between provinces.
    """
    # make_url() reads the current cell through this module-level name.
    global tr
    index_url = main_address + 'index.html'
    analysis_html(index_url, '//tr[@class="provincetr"]/td')
    # NOTE(review): the final cell is skipped - presumably it carries no
    # link; confirm against the page markup.
    cells = trs[:-1]
    for tr in cells:
        name = tr.xpath('./a/text()')[0]
        link = make_url(main_address, './a/@href')
        print(name)
        city(link, name)
        time.sleep(1)
 
 
def city(province_url, province):
    """Crawl one province page and descend into each city row.

    Args:
        province_url: URL of the province page to fetch.
        province: Province name, forwarded so it can be written with each
            output record.

    For every ``citytr`` row the city name, code and page link are read
    and handed to ``country``; a one-second pause follows each city.
    """
    analysis_html(province_url, '//tr[@class="citytr"]')
    for row in trs:
        name = row.xpath('./td[2]/a/text()')[0]
        # make_url() looks at the module-level `tr`, whereas this loop's
        # row is a local, so the URL is assembled inline instead.
        href = row.xpath('./td[1]/a/@href')[0]
        target = f'{main_address}{href}'

        code = row.xpath('./td[1]/a/text()')[0]

        country(target, name, code, province, province_url)
        time.sleep(1)
 
 
def country(city_url, city, city_id, province, province_url):
    """Write one output line per linked county row of a city page.

    Args:
        city_url: URL of the city page to fetch.
        city: City name written into each record.
        city_id: City code written into each record.
        province: Province name written into each record.
        province_url: Not used by this function; accepted so existing
            callers keep working.

    Each ``countytr`` row whose first two cells contain a link produces a
    line in the module-level file ``f``.  Rows without a link in those
    cells are skipped.  County pages themselves are not fetched.
    """
    analysis_html(city_url, '//tr[@class="countytr"]')
    for row in trs:
        try:
            county = row.xpath('./td[2]/a/text()')[0]
            county_id = row.xpath('./td[1]/a/text()')[0]
        except IndexError:
            # No <a> inside the cell, so there is no name/code to record.
            # Only this case is ignored: write errors and Ctrl-C are no
            # longer swallowed the way the former bare `except` did.
            continue

        f.write(f'{county},{county_id}\t,{city},{city_id}\t,{province}\n')
        time.sleep(1)
 
 
# Request headers sent with every page fetch.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.204 Safari/537.36',
    'Cookie': 'AD_RS_COOKIE=20080918; _trs_uv=kahvgie3_6_fc6v',
}

# Base URL that page links are appended to.
main_address = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/"

# Opened in append mode, so records from repeated runs accumulate in the
# same file.  `f` is read as a module-level name by the writing function.
with open('省市区.csv', mode='a', encoding='utf-8') as f:
    province()