To get the most complete and current gazetteer, this script pulls the latest province/city/county/town/village data (codes included) from the National Bureau of Statistics site, also known as the national city-and-village database. The crawl itself is not hard, but the generation step nests more than four for loops and still errors out, so I decided to pause with sleep between requests. A long-running crawl like this breaks every time, so a database is worth considering.
This approach survives network drops: any URL that fails to download is recorded so it can be crawled again in a second pass.
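In one sketch, the pause-and-record idea looks like this. It is a slightly more general version of what the script below does at every level; the fetch_with_retry name, the retry counts, and the error-file name are my own illustration, not part of the original script:

import time

def fetch_with_retry(fetch, url, retries=3, delay=3, errfile='error.txt'):
    # Try fetch(url) up to `retries` times, sleeping `delay` seconds between tries.
    for attempt in range(retries):
        try:
            return fetch(url)
        except Exception:
            time.sleep(delay)  # pause before the next attempt, as described above
    # every attempt failed: record the url for a later second pass
    with open(errfile, 'a', encoding='utf-8') as f:
        f.write(url + '\n')
    return None

The full crawler follows; it inlines this pattern with try/except blocks at each administrative level.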
# -*- coding: utf-8 -*-
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import time

url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/index.html'
headers = {
    'user-agent': UserAgent().random
}

def getData(url):
    """Fetch a page and return it as a BeautifulSoup tree."""
    response = requests.get(url, headers=headers, timeout=10)  # a timeout avoids hanging forever on a dropped connection
    response.encoding = 'utf-8'  # force utf-8 so the Chinese text decodes correctly
    soup = BeautifulSoup(response.text, 'lxml')
    response.close()  # must close, or a long crawl exhausts the connection pool
    return soup
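# A possible alternative, not used in the original script: requests can retry
# transient failures itself via urllib3's Retry, which would replace some of
# the hand-rolled sleep-and-log handling further down. Sketch only:
#
#   from requests.adapters import HTTPAdapter
#   from urllib3.util.retry import Retry
#   session = requests.Session()
#   session.mount('http://', HTTPAdapter(max_retries=Retry(total=3, backoff_factor=1)))
#   response = session.get(url, headers=headers, timeout=10)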
def get_all_province(url):
    """Collect the code, name and link of every province on the index page."""
    link = []
    province = []
    provinceCode = []
    provinceOnly = '13.html'  # set to '' to crawl every province
    soup = getData(url)
    td_list = soup.find('table', class_='provincetable').find_all('td')
    for td in td_list:
        urlc = td.find('a', href=True)
        if urlc is None:
            continue
        temp = urlc.get('href')
        # when provinceOnly is set, keep only that one province
        if provinceOnly and temp != provinceOnly:
            continue
        link.append('http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/' + temp)
        province.append(urlc.get_text())
        provinceCode.append(temp.split('.')[0])
    return provinceCode, province, link
def get_all_city(url):
    """Collect the code, name and link of every city on a province page."""
    cityLink = []
    city = []
    cityCode = []
    soup = getData(url)
    td_list = soup.find('table', class_='citytable').find_all('td')
    for td in td_list:
        urlc = td.find('a', href=True)
        if urlc is None:
            continue
        tempLink = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/' + urlc.get('href')
        if tempLink not in cityLink:  # code and name cells link to the same page
            cityLink.append(tempLink)
        temp = urlc.get_text()
        # isalpha() is True for the Chinese name cell and False for the digits-only code cell
        if temp.isalpha():
            city.append(temp)
        else:
            cityCode.append(temp)
    return cityCode, city, cityLink
def get_all_county(url):
    """Collect the code, name and link of every county on a city page."""
    countyLink = []
    county = []
    countyCode = []
    # county links are relative, so resolve them against the city page's directory
    baseURL = url[0:url.rfind('/')]
    soup = getData(url)
    td_list = soup.find('table', class_='countytable').find_all('td')
    for i, tdx in enumerate(td_list):
        if i > 1:  # the first two cells are the table header
            temp = tdx.get_text()
            if i % 2 == 0:  # cells alternate: even index = code, odd index = name
                countyCode.append(temp)
            else:
                county.append(temp)
                if tdx.find('a') is None:
                    countyLink.append('')  # some districts have no further page
                else:
                    urlc = tdx.find('a')
                    countyLink.append(baseURL + '/' + urlc.get('href'))
    return countyCode, county, countyLink
def get_all_town(url):
    """Collect the code, name and link of every town on a county page."""
    townLink = []
    town = []
    townCode = []
    baseURL = url[0:url.rfind('/')]
    soup = getData(url)
    td_list = soup.find('table', class_='towntable').find_all('td')
    for i, tdx in enumerate(td_list):
        if i > 1:  # skip the two header cells
            temp = tdx.get_text()
            if i % 2 == 0:
                townCode.append(temp)
            else:
                town.append(temp)
                if tdx.find('a') is None:
                    townLink.append('')
                else:
                    urlc = tdx.find('a')
                    townLink.append(baseURL + '/' + urlc.get('href'))
    return townCode, town, townLink
def get_all_committee(url):
    """Collect the code, urban-rural type and name of every village committee on a town page."""
    committeeType = []
    committee = []
    committeeCode = []
    soup = getData(url)
    td_list = soup.find('table', class_='villagetable').find_all('td')
    for i, tdx in enumerate(td_list):
        if i > 2:  # the village table has a three-cell header
            temp = tdx.get_text()
            if i % 3 == 0:  # cells cycle: code, urban-rural type, name
                committeeCode.append(temp)
            elif temp.isdigit():
                committeeType.append(temp)
            else:
                committee.append(temp)
    return committeeCode, committee, committeeType
if __name__ == '__main__':
    # leftover SQL templates from earlier runs, kept for reference:
    # result = "insert into city (code,name,previous,link) values "
    # result = "insert into county (code,name,cityCode,provinceCode) values "
    result = ''
    province_list = get_all_province(url)
    for i, provinceCode in enumerate(province_list[0]):
        print(provinceCode, province_list[1][i], province_list[2][i])
        try:
            city_list = get_all_city(province_list[2][i])
        except Exception:
            time.sleep(3)
            with open('error-city.txt', 'a', encoding='utf-8') as fa:
                fa.write(province_list[2][i] + '\n')
            continue  # without this, a stale city_list from the previous province leaks in
        for j, cityCode in enumerate(city_list[0]):
            print(' ' + cityCode, city_list[1][j], city_list[2][j])
            if city_list[2][j] == '':
                continue
            try:
                county_list = get_all_county(city_list[2][j])
            except Exception:
                print('----get all county')
                time.sleep(3)
                with open('error-county.txt', 'a', encoding='utf-8') as fb:
                    fb.write(city_list[2][j] + '\n')
                continue
            for k, countyCode in enumerate(county_list[0]):
                print('  ' + countyCode, county_list[1][k], county_list[2][k])
                print('---------')
                if len(county_list[2][k]) == 0:
                    continue  # county has no town page
                try:
                    town_list = get_all_town(county_list[2][k])
                except Exception:
                    print('===town error====')
                    time.sleep(5)
                    with open('error-town.txt', 'a', encoding='utf-8') as fe:
                        fe.write(county_list[2][k] + '\n')
                    continue
                for l, townCode in enumerate(town_list[0]):
                    time.sleep(8)  # throttle hard here: this is the hottest loop
                    if town_list[2][l] == '':
                        continue
                    try:
                        committee_list = get_all_committee(town_list[2][l])
                    except Exception:
                        with open('error-committee.txt', 'a', encoding='utf-8') as fx:
                            fx.write(town_list[2][l] + '\n')
                        continue
                    for m, committeeCode in enumerate(committee_list[0]):
                        print('   ' + committeeCode, committee_list[1][m], committee_list[2][m])
                        # one "values" tuple per village, appended to cun.txt
                        result = ('("' + committeeCode + '","' + committee_list[1][m] + '","'
                                  + committee_list[2][m] + '","' + townCode + '","' + town_list[1][l]
                                  + '","' + countyCode + '","' + county_list[1][k] + '","'
                                  + cityCode + '","' + city_list[1][j] + '","'
                                  + provinceCode + '","' + province_list[1][i] + '"),')
                        with open('cun.txt', 'a', encoding='utf-8') as fy:
                            fy.write(result + '\n')
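As mentioned at the top, a database is the natural next step for a crawl this long, because progress then survives a crash. A minimal sketch with the standard-library sqlite3; the region.db file name, the cun table and its column names are my own assumptions, mirroring the order of the values tuple written above:

import sqlite3

conn = sqlite3.connect('region.db')
conn.execute('''CREATE TABLE IF NOT EXISTS cun (
    code TEXT PRIMARY KEY,   -- village code: re-inserting after a restart is a no-op
    name TEXT, type TEXT,
    townCode TEXT, town TEXT,
    countyCode TEXT, county TEXT,
    cityCode TEXT, city TEXT,
    provinceCode TEXT, province TEXT)''')

def save_village(row):
    # row follows the cun.txt tuple order:
    # (code, name, type, townCode, town, countyCode, county, cityCode, city, provinceCode, province)
    conn.execute('INSERT OR IGNORE INTO cun VALUES (?,?,?,?,?,?,?,?,?,?,?)', row)
    conn.commit()  # commit per row, so a crash loses at most one record

Swapping the fy.write(...) call for save_village(...) would let a restarted run skip villages it has already stored instead of appending duplicates to the text file.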