To build the most complete and current gazetteer, this post scrapes the latest province / city / county / town / village data (codes included) from the National Bureau of Statistics site, also known as the national city-and-village database. The crawl itself is not hard, but producing the output file takes four or more nested for loops, and requests still fail partway through, so sleep pauses are inserted between them. Because a long-looping crawl like this gets cut off on every run, storing results in a database rather than flat files is worth considering.
This approach keeps a network drop from wasting the whole run: any URL that cannot be scraped is appended to an error file, and the loop moves on to the next URL, so the logged ones can be retried later.
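To make that retry pass concrete, here is a minimal sketch of re-running the logged URLs. `retry_errors` is a hypothetical helper, not part of the original script; it assumes the error files hold one failed URL per line, which matches how the main loop below writes them.

```python
import os
import time

def retry_errors(path, fetch, pause=3):
    """Retry every URL logged in `path`; keep only the still-failing ones."""
    if not os.path.exists(path):
        return
    with open(path, encoding='utf-8') as f:
        urls = [line.strip() for line in f if line.strip()]
    still_failing = []
    for u in urls:
        try:
            fetch(u)  # e.g. get_all_county(u) from the script below
        except Exception:
            still_failing.append(u)
        time.sleep(pause)  # stay polite between retries
    with open(path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(still_failing))

# usage: retry_errors('error-county.txt', get_all_county)
```

A real retry pass would also have to persist whatever `fetch` returns; this sketch only shows the bookkeeping of shrinking the error file.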
```python
# -*- coding: utf-8 -*-
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import time

BASE = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/'
url = BASE + 'index.html'
headers = {'user-agent': UserAgent().random}


def getData(url):
    """Fetch a page and return the parsed BeautifulSoup tree."""
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')
    response.close()  # must close, or a long crawl exhausts connections
    return soup


def get_all_province(url):
    """Return (codes, names, links) for the provinces on the index page."""
    link, province, provinceCode = [], [], []
    provinceOnly = '13.html'  # debug filter: crawl only this province; set to '' for all
    soup = getData(url)
    td_list = soup.find('table', class_='provincetable').find_all('td')
    for td in td_list:
        urlc = td.find('a', href=True)
        if urlc is not None:
            temp = urlc.get('href')
            if len(provinceOnly) > 0 and temp != provinceOnly:
                continue
            link.append(BASE + temp)
            province.append(urlc.get_text())
            provinceCode.append(temp.split('.')[0])
    return provinceCode, province, link


def get_all_city(url):
    """Return (codes, names, links) for the cities of one province."""
    cityLink, city, cityCode = [], [], []
    soup = getData(url)
    td_list = soup.find('table', class_='citytable').find_all('td')
    for td in td_list:
        urlc = td.find('a', href=True)
        if urlc is not None:
            tempLink = BASE + urlc.get('href')
            if tempLink not in cityLink:  # code cell and name cell share one href
                cityLink.append(tempLink)
            temp = urlc.get_text()
            if temp.isalpha():  # the city name (isalpha is True for CJK text)
                city.append(temp)
            else:               # pure digits: the city code
                cityCode.append(temp)
    return cityCode, city, cityLink


def get_all_county(url):
    """Return (codes, names, links) for the counties of one city."""
    countyLink, county, countyCode = [], [], []
    baseURL = url[0:url.rfind('/')]  # county hrefs are relative to the city page
    soup = getData(url)
    td_list = soup.find('table', class_='countytable').find_all('td')
    for i, tdx in enumerate(td_list):
        if i > 1:  # skip the two header cells
            temp = tdx.get_text()
            if i % 2 == 0:  # even cell: code
                countyCode.append(temp)
            else:           # odd cell: name, possibly linking one level down
                county.append(temp)
                if tdx.find('a') is None:
                    countyLink.append('')  # leaf node: no deeper page
                else:
                    countyLink.append(baseURL + '/' + tdx.find('a').get('href'))
    return countyCode, county, countyLink


def get_all_town(url):
    """Return (codes, names, links) for the towns of one county."""
    townLink, town, townCode = [], [], []
    baseURL = url[0:url.rfind('/')]
    soup = getData(url)
    td_list = soup.find('table', class_='towntable').find_all('td')
    for i, tdx in enumerate(td_list):
        if i > 1:  # same two-column layout as the county table
            temp = tdx.get_text()
            if i % 2 == 0:
                townCode.append(temp)
            else:
                town.append(temp)
                if tdx.find('a') is None:
                    townLink.append('')
                else:
                    townLink.append(baseURL + '/' + tdx.find('a').get('href'))
    return townCode, town, townLink


def get_all_committee(url):
    """Return (codes, names, urban-rural types) for the villages of one town."""
    committeeType, committee, committeeCode = [], [], []
    soup = getData(url)
    td_list = soup.find('table', class_='villagetable').find_all('td')
    for i, tdx in enumerate(td_list):
        if i > 2:  # skip the three header cells
            temp = tdx.get_text()
            if i % 3 == 0:        # column 1: village code
                committeeCode.append(temp)
            elif temp.isdigit():  # column 2: urban-rural classification code
                committeeType.append(temp)
            else:                 # column 3: village / committee name
                committee.append(temp)
    return committeeCode, committee, committeeType


if __name__ == '__main__':
    province_list = get_all_province(url)
    for i, provinceCode in enumerate(province_list[0]):
        print(provinceCode, province_list[1][i], province_list[2][i])
        try:
            city_list = get_all_city(province_list[2][i])
        except Exception:
            time.sleep(3)
            with open('error-city.txt', 'a', encoding='utf-8') as fa:
                fa.write(province_list[2][i] + '\n')
            continue  # don't reuse a stale city_list from the last iteration
        for j, cityCode in enumerate(city_list[0]):
            print(' ' + cityCode, city_list[1][j], city_list[2][j])
            if city_list[2][j] == '':
                continue
            try:
                county_list = get_all_county(city_list[2][j])
            except Exception:
                print('----get all county error')
                time.sleep(3)
                with open('error-county.txt', 'a', encoding='utf-8') as fb:
                    fb.write(city_list[2][j] + '\n')
                continue
            for k, countyCode in enumerate(county_list[0]):
                print('  ' + countyCode, county_list[1][k], county_list[2][k])
                if len(county_list[2][k]) == 0:
                    continue  # leaf county with no town page
                try:
                    town_list = get_all_town(county_list[2][k])
                except Exception:
                    print('===town error====')
                    time.sleep(5)
                    with open('error-town.txt', 'a', encoding='utf-8') as fe:
                        fe.write(county_list[2][k] + '\n')
                    continue
                for l, townCode in enumerate(town_list[0]):
                    time.sleep(8)  # throttle: the site cuts off fast crawlers
                    if town_list[2][l] == '':
                        continue
                    try:
                        committee_list = get_all_committee(town_list[2][l])
                    except Exception:
                        with open('error-committee.txt', 'a', encoding='utf-8') as fx:
                            fx.write(town_list[2][l] + '\n')
                        continue
                    for m, committeeCode in enumerate(committee_list[0]):
                        print('   ' + committeeCode, committee_list[1][m], committee_list[2][m])
                        # one SQL values tuple per village, appended to cun.txt
                        result = ('("' + committeeCode + '","' + committee_list[1][m] + '","'
                                  + committee_list[2][m] + '","' + townCode + '","'
                                  + town_list[1][l] + '","' + countyCode + '","'
                                  + county_list[1][k] + '","' + cityCode + '","'
                                  + city_list[1][j] + '","' + provinceCode + '","'
                                  + province_list[1][i] + '"),')
                        with open('cun.txt', 'a', encoding='utf-8') as fy:
                            fy.write(result + '\n')
```
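The opening paragraph suggests moving to a database for a crawl this long. Here is a minimal sketch of that idea using SQLite from the standard library; the `region.db` file name and the `village` table layout are my assumptions, not from the script. Replacing the `cun.txt` append with `save_village(...)` makes interrupted runs resumable, because `INSERT OR IGNORE` on the primary key silently skips rows that were already collected.

```python
# Sketch only: SQLite persistence so a crashed crawl can resume where it left off.
# The database/table/column names below are assumptions, not from the original post.
import sqlite3

conn = sqlite3.connect('region.db')
conn.execute('''CREATE TABLE IF NOT EXISTS village (
    code TEXT PRIMARY KEY, name TEXT, type TEXT,
    townCode TEXT, town TEXT, countyCode TEXT, county TEXT,
    cityCode TEXT, city TEXT, provinceCode TEXT, province TEXT)''')

def save_village(row):
    """Insert one 11-field village row; IGNORE makes re-runs idempotent."""
    conn.execute('INSERT OR IGNORE INTO village VALUES (?,?,?,?,?,?,?,?,?,?,?)', row)
    conn.commit()
```

Inside the innermost loop, the text-file write would become `save_village((committeeCode, committee_list[1][m], committee_list[2][m], townCode, town_list[1][l], countyCode, county_list[1][k], cityCode, city_list[1][j], provinceCode, province_list[1][i]))`.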