To get the most complete and current gazetteer, this script pulls the latest province/city/county/town/village data (codes included) from the National Bureau of Statistics site, also known as the national city-and-village database. The crawl itself is not hard, but the generation step nests more than four for loops and still errors out, so I decided to pause with sleep between requests. A long-running crawl like this breaks every time, so a database is worth considering.
This approach survives network drops: any URL that fails to download is recorded so it can be crawled again in a second pass.
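In one sketch, the pause-and-record idea looks like this. It is a slightly more general version of what the script below does at every level; the fetch_with_retry name, the retry counts, and the error-file name are my own illustration, not part of the original script:

import time

def fetch_with_retry(fetch, url, retries=3, delay=3, errfile='error.txt'):
    # Try fetch(url) up to `retries` times, sleeping `delay` seconds between tries.
    for attempt in range(retries):
        try:
            return fetch(url)
        except Exception:
            time.sleep(delay)  # pause before the next attempt, as described above
    # every attempt failed: record the url for a later second pass
    with open(errfile, 'a', encoding='utf-8') as f:
        f.write(url + '\n')
    return None

The full crawler follows; it inlines this pattern with try/except blocks at each administrative level.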
# -*- coding: utf-8 -*-
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import time

url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/index.html'
headers = {
    'user-agent': UserAgent().random
}

def getData(url):
    """Fetch a page and return it as a BeautifulSoup tree."""
    response = requests.get(url, headers=headers, timeout=10)  # a timeout avoids hanging forever on a dropped connection
    response.encoding = 'utf-8'  # force utf-8 so the Chinese text decodes correctly
    soup = BeautifulSoup(response.text, 'lxml')
    response.close()  # must close, or a long crawl exhausts the connection pool
    return soup
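# A possible alternative, not used in the original script: requests can retry
# transient failures itself via urllib3's Retry, which would replace some of
# the hand-rolled sleep-and-log handling further down. Sketch only:
#
#   from requests.adapters import HTTPAdapter
#   from urllib3.util.retry import Retry
#   session = requests.Session()
#   session.mount('http://', HTTPAdapter(max_retries=Retry(total=3, backoff_factor=1)))
#   response = session.get(url, headers=headers, timeout=10)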
def get_all_province(url):
    """Collect the code, name and link of every province on the index page."""
    link = []
    province = []
    provinceCode = []
    provinceOnly = '13.html'  # set to '' to crawl every province
    soup = getData(url)
    td_list = soup.find('table', class_='provincetable').find_all('td')
    for td in td_list:
        urlc = td.find('a', href=True)
        if urlc is None:
            continue
        temp = urlc.get('href')
        # when provinceOnly is set, keep only that one province
        if provinceOnly and temp != provinceOnly:
            continue
        link.append('http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/' + temp)
        province.append(urlc.get_text())
        provinceCode.append(temp.split('.')[0])
    return provinceCode, province, link
def get_all_city(url):
    """Collect the code, name and link of every city on a province page."""
    cityLink = []
    city = []
    cityCode = []
    soup = getData(url)
    td_list = soup.find('table', class_='citytable').find_all('td')
    for td in td_list:
        urlc = td.find('a', href=True)
        if urlc is None:
            continue
        tempLink = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/' + urlc.get('href')
        if tempLink not in cityLink:  # code and name cells link to the same page
            cityLink.append(tempLink)
        temp = urlc.get_text()
        # isalpha() is True for the Chinese name cell and False for the digits-only code cell
        if temp.isalpha():
            city.append(temp)
        else:
            cityCode.append(temp)
    return cityCode, city, cityLink
def get_all_county(url):
    """Collect the code, name and link of every county on a city page."""
    countyLink = []
    county = []
    countyCode = []
    # county links are relative, so resolve them against the city page's directory
    baseURL = url[0:url.rfind('/')]
    soup = getData(url)
    td_list = soup.find('table', class_='countytable').find_all('td')
    for i, tdx in enumerate(td_list):
        if i > 1:  # the first two cells are the table header
            temp = tdx.get_text()
            if i % 2 == 0:  # cells alternate: even index = code, odd index = name
                countyCode.append(temp)
            else:
                county.append(temp)
                if tdx.find('a') is None:
                    countyLink.append('')  # some districts have no further page
                else:
                    urlc = tdx.find('a')
                    countyLink.append(baseURL + '/' + urlc.get('href'))
    return countyCode, county, countyLink
def get_all_town(url):
    """Collect the code, name and link of every town on a county page."""
    townLink = []
    town = []
    townCode = []
    baseURL = url[0:url.rfind('/')]
    soup = getData(url)
    td_list = soup.find('table', class_='towntable').find_all('td')
    for i, tdx in enumerate(td_list):
        if i > 1:  # skip the two header cells
            temp = tdx.get_text()
            if i % 2 == 0:
                townCode.append(temp)
            else:
                town.append(temp)
                if tdx.find('a') is None:
                    townLink.append('')
                else:
                    urlc = tdx.find('a')
                    townLink.append(baseURL + '/' + urlc.get('href'))
    return townCode, town, townLink
def get_all_committee(url):
    """Collect the code, urban-rural type and name of every village committee on a town page."""
    committeeType = []
    committee = []
    committeeCode = []
    soup = getData(url)
    td_list = soup.find('table', class_='villagetable').find_all('td')
    for i, tdx in enumerate(td_list):
        if i > 2:  # the village table has a three-cell header
            temp = tdx.get_text()
            if i % 3 == 0:  # cells cycle: code, urban-rural type, name
                committeeCode.append(temp)
            elif temp.isdigit():
                committeeType.append(temp)
            else:
                committee.append(temp)
    return committeeCode, committee, committeeType
if __name__ == '__main__':
    # leftover SQL templates from earlier runs, kept for reference:
    # result = "insert into city (code,name,previous,link) values "
    # result = "insert into county (code,name,cityCode,provinceCode) values "
    result = ''
    province_list = get_all_province(url)
    for i, provinceCode in enumerate(province_list[0]):
        print(provinceCode, province_list[1][i], province_list[2][i])
        try:
            city_list = get_all_city(province_list[2][i])
        except Exception:
            time.sleep(3)
            with open('error-city.txt', 'a', encoding='utf-8') as fa:
                fa.write(province_list[2][i] + '\n')
            continue  # without this, a stale city_list from the previous province leaks in
        for j, cityCode in enumerate(city_list[0]):
            print(' ' + cityCode, city_list[1][j], city_list[2][j])
            if city_list[2][j] == '':
                continue
            try:
                county_list = get_all_county(city_list[2][j])
            except Exception:
                print('----get all county')
                time.sleep(3)
                with open('error-county.txt', 'a', encoding='utf-8') as fb:
                    fb.write(city_list[2][j] + '\n')
                continue
            for k, countyCode in enumerate(county_list[0]):
                print('  ' + countyCode, county_list[1][k], county_list[2][k])
                print('---------')
                if len(county_list[2][k]) == 0:
                    continue  # county has no town page
                try:
                    town_list = get_all_town(county_list[2][k])
                except Exception:
                    print('===town error====')
                    time.sleep(5)
                    with open('error-town.txt', 'a', encoding='utf-8') as fe:
                        fe.write(county_list[2][k] + '\n')
                    continue
                for l, townCode in enumerate(town_list[0]):
                    time.sleep(8)  # throttle hard here: this is the hottest loop
                    if town_list[2][l] == '':
                        continue
                    try:
                        committee_list = get_all_committee(town_list[2][l])
                    except Exception:
                        with open('error-committee.txt', 'a', encoding='utf-8') as fx:
                            fx.write(town_list[2][l] + '\n')
                        continue
                    for m, committeeCode in enumerate(committee_list[0]):
                        print('   ' + committeeCode, committee_list[1][m], committee_list[2][m])
                        # one "values" tuple per village, appended to cun.txt
                        result = ('("' + committeeCode + '","' + committee_list[1][m] + '","'
                                  + committee_list[2][m] + '","' + townCode + '","' + town_list[1][l]
                                  + '","' + countyCode + '","' + county_list[1][k] + '","'
                                  + cityCode + '","' + city_list[1][j] + '","'
                                  + provinceCode + '","' + province_list[1][i] + '"),')
                        with open('cun.txt', 'a', encoding='utf-8') as fy:
                            fy.write(result + '\n')
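As mentioned at the top, a database is the natural next step for a crawl this long, because progress then survives a crash. A minimal sketch with the standard-library sqlite3; the region.db file name, the cun table and its column names are my own assumptions, mirroring the order of the values tuple written above:

import sqlite3

conn = sqlite3.connect('region.db')
conn.execute('''CREATE TABLE IF NOT EXISTS cun (
    code TEXT PRIMARY KEY,   -- village code: re-inserting after a restart is a no-op
    name TEXT, type TEXT,
    townCode TEXT, town TEXT,
    countyCode TEXT, county TEXT,
    cityCode TEXT, city TEXT,
    provinceCode TEXT, province TEXT)''')

def save_village(row):
    # row follows the cun.txt tuple order:
    # (code, name, type, townCode, town, countyCode, county, cityCode, city, provinceCode, province)
    conn.execute('INSERT OR IGNORE INTO cun VALUES (?,?,?,?,?,?,?,?,?,?,?)', row)
    conn.commit()  # commit per row, so a crash loses at most one record

Swapping the fy.write(...) call for save_village(...) would let a restarted run skip villages it has already stored instead of appending duplicates to the text file.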