# 采集期刊分区表:对方网站通过 JSON 展示数据,且有是否订阅判断;直接读取 JSON 失败后,决定使用 Selenium。
# 对方用伪元素防采集,难点是 driver 操作表格。
# 注意:此文件经网页转义显示后部分代码有问题(尤其正则表达式内容丢失),使用前需还原。
import time
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
# Target site, reached through the university VPN proxy; the localhost URL
# below was used for offline testing against a saved page.
url = 'https://www-fenqubiao-com.vpn.cust.edu.cn/'
#url='http://localhost/3.html'
option = webdriver.ChromeOptions()
# Uncomment to run without a visible browser window.
#option.add_argument('headless')
# NOTE(review): executable_path/chrome_options are the Selenium 3 API; kept
# because the whole script relies on the Selenium 3 find_element_by_* calls.
driver = webdriver.Chrome(executable_path='chromedriver', chrome_options=option)
driver.get(url)
page_src = driver.page_source
#print(page_src)
# Subject category to scrape; also used as the output file-name prefix.
listName = '医学'
# Generous wait for the VPN login page to finish loading.
time.sleep(25)
driver.find_element_by_id('Username').clear()
driver.find_element_by_id('Username').send_keys('cust')
driver.find_element_by_id('Password').clear()
driver.find_element_by_id('Password').send_keys('cust')
time.sleep(1)
driver.find_element_by_id('login_button').click()
# Second click advances past the post-login page ("shengjiban" / upgraded
# version).  NOTE(review): it reuses the same element id as the login button —
# confirm this is intentional and not a copy-paste slip.
driver.find_element_by_id('login_button').click()
time.sleep(1)
# Follow the link to the 2022 ("new version") partition table.
aElements = driver.find_elements_by_tag_name("a")
for name in aElements:
    href = name.get_attribute("href")
    if href is not None and "/Connect/NewVersion.aspx" in href:
        print("go to 2022 page")
        name.click()
        break
time.sleep(2)
# Open the chosen subject category list.
driver.find_element_by_link_text(listName).click()
'''
#when error happened when long pages, use this method
for i in range(1,177):#now begin in page 177
    time.sleep(2)
    print('now page',i)
    nextPage=driver.find_element_by_link_text('Next')
    nextPage.click()
'''
def findIndex(bNumber, xIndex):
    """Look up a value in a flat [key, value, key, value, ...] list.

    bNumber: flat list of strings where even indices hold keys and the
        following odd index holds the paired value,
        e.g. ['0', '2', '1', '3', '3', '2'].
    xIndex: the key (a str) to search for.

    Returns the value paired with the first matching key, or None when
    the key is not found (implicit return, as in the original).
    """
    # Step over the key positions only; stopping at len-1 guarantees i+1
    # is valid even if the list has an odd length.
    for i in range(0, len(bNumber) - 1, 2):
        if bNumber[i] == xIndex:
            return bNumber[i + 1]
# Give the category list a moment to render before the scrape loop starts.
time.sleep(1)
i = 0  # number of list pages finished so far
while True:
    # Table holding the current page of the journal list.
    table = driver.find_element_by_id('myTable')
    # total rows on this list page
    table_rows = table.find_elements_by_tag_name('tr')
    #print(u"row", len(table_rows))
    maxRow = len(table_rows)
    # Each <a> in the table is one journal; clicking opens a detail window.
    aElements = table.find_elements_by_tag_name('a')
    h1 = driver.window_handles
    page_source = driver.page_source
    for aElement in aElements:
        time.sleep(3)
        aElement.click()
        time.sleep(2)
        # The detail page opens as a second window handle.
        h2 = driver.window_handles
        driver.switch_to.window(h2[1])
        currentPageUrl = driver.current_url
        # Re-load the detail page by URL so page_source reflects it fully.
        driver.get(currentPageUrl)
        time.sleep(2)
        currentPage_source = driver.page_source
        # Collapse the markup to one line so the regexes can span tag breaks.
        page_src_noBR = currentPage_source.replace('\r', '').replace('\n', '').replace('\t', '')
        #page_src_noBRSPACE=re.sub(r'[\s]+', '', page_src)
        # NOTE(review): this pattern was destroyed by the HTML escaping noted
        # in the file header — an empty pattern matches at every position.
        # It presumably extracted the pseudo-element CSS that encodes the
        # partition digits (the site's anti-scraping trick); restore the real
        # pattern before running.
        zoneAll = re.findall(r"", page_src_noBR)
        print(zoneAll)
        # Digits from that CSS form the [key, value, ...] pairs for findIndex.
        bNumber = re.findall(r"[0-9]", zoneAll[0])
        table = driver.find_elements_by_tag_name('table')
        #print(table)
        # First table on the detail page: journal metadata, one field per row,
        # label in td[0] and value in td[1].
        table_rows = table[0].find_elements_by_tag_name('tr')
        title = table_rows[0].find_elements_by_tag_name('td')[1].text
        year = table_rows[1].find_elements_by_tag_name('td')[1].text
        ISSN = table_rows[2].find_elements_by_tag_name('td')[1].text
        review = table_rows[3].find_elements_by_tag_name('td')[1].text
        access = table_rows[4].find_elements_by_tag_name('td')[1].text
        WoS = table_rows[5].find_elements_by_tag_name('td')[1].text
        #print(title,year,ISSN,review,access,WoS)
        print(' ' + title, year, ISSN, review, access, WoS)
        # Append each record as a SQL-style values tuple for later bulk insert.
        with open(listName + 'index.txt', 'a', encoding='utf-8') as f:
            f.write('("{}","{}","{}","{}","{}","{}","{}","{}"),'.format(listName, currentPageUrl, title, year, ISSN, review, access, WoS))
        # Second table: per-discipline partition rows (only isTop handled here).
        texts = ''
        TRs = table[1].find_elements_by_tag_name('tr')
        for rows, TR in enumerate(TRs):
            if rows > 0:  # skip the header row
                TDs = TR.find_elements_by_tag_name('td')
                for index, td in enumerate(TDs):
                    #if index>0:
                    texts += td.text
        # NOTE(review): both patterns below are also truncated by the escaping
        # problem; as written they capture empty strings.
        zones = re.findall(r'href="/M(.*?)', page_src_noBR)
        _fenqu = re.findall(r'', page_src_noBR)
        #print(_fenqu)
        #print(zones)
        for index, zone in enumerate(zones):
            #print(zone+'\n')
            _name = re.findall(r'>(.*?)', zone)
            # NOTE(review): `name` here is the leftover loop variable from the
            # login section (probably `_name[0]` was intended), and `_suoxie`
            # is never defined anywhere in this file — as published this line
            # raises NameError; the un-escaped original must have defined it.
            print(name, findIndex(bNumber, _fenqu[index][1]), _suoxie[0])
            with open(listName + 'indexSmall.txt', 'a', encoding='utf-8') as f3:
                f3.write('("{}","{}","{}","{}","{}"),'.format(ISSN, currentPageUrl, name, findIndex(bNumber, _fenqu[index][1]), _suoxie[0]))
        driver.close()  # close the current detail window
        time.sleep(0.5)
        driver.switch_to.window(h2[0])  # back to the main list page
    i += 1
    print('page:', i)
    # On the last page the "Next" button carries the disabled class.
    if 'paginate_button next disabled"' in page_source:
        break
    nextPage = driver.find_element_by_link_text('Next')
    nextPage.click()
    time.sleep(0.5)