1.背景

采集期刊分区表。对方网站通过 JSON 接口展示数据,并且会判断是否已订阅;经测试,直接读取 JSON 失败后,决定改用 Selenium。

2.技术

对方使用 CSS 伪元素防采集,难点在于用 WebDriver 操作表格。

3.代码

以下代码中含有被转义的字符,在页面上显示不完整(例如部分正则表达式显示为空),完整代码请通过查看网页源代码获取。

import re
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By


# Landing page of the journal-ranking site, opened through the campus VPN host.
url='https://www-fenqubiao-com.vpn.cust.edu.cn/'
#url='http://localhost/3.html'

option = webdriver.ChromeOptions()
#option.add_argument('headless')
# Selenium 4 removed the `executable_path` and `chrome_options` keywords.
# `options=` is accepted by Selenium 3.8+ and 4.x, and chromedriver is looked
# up on PATH, which is what executable_path='chromedriver' amounted to.
driver = webdriver.Chrome(options=option)
driver.get(url)
page_src = driver.page_source
#print(page_src)

# Link text of the subject category to open on the listing page.
listName='医学'

# Fixed pause before touching the login form.
# NOTE(review): presumably leaves time for the VPN page to load or for a
# manual step -- confirm before shortening it.
time.sleep(25)

# Fill in the login form. Each field is located once and reused.
# NOTE(review): the credentials are hard-coded in the script.
username_input = driver.find_element(By.ID, 'Username')
username_input.clear()
username_input.send_keys('cust')
password_input = driver.find_element(By.ID, 'Password')
password_input.clear()
password_input.send_keys('cust')
time.sleep(1)

driver.find_element(By.ID, 'login_button').click()

#click shengjiban
# NOTE(review): this clicks the element with id 'login_button' a second time,
# immediately after the first click. The comment above suggests it targets the
# "shengjiban" step; kept as in the original -- confirm it is not a leftover.
driver.find_element(By.ID, 'login_button').click()
time.sleep(1)

# Follow the first link whose href contains the new-version page path.
aElements = driver.find_elements(By.TAG_NAME, "a")
for name in aElements:
    href = name.get_attribute("href")
    if href is not None and "/Connect/NewVersion.aspx" in href:
        print("go to 2022 page")
        name.click()
        break
time.sleep(2)

# Open the listing for the chosen subject category.
driver.find_element(By.LINK_TEXT, listName).click()

# Disabled helper: when a long run fails part-way, enable this to click
# "Next" repeatedly and skip ahead before scraping resumes.
# for i in range(1, 177):  # now begin in page 177
#     time.sleep(2)
#     print('now page', i)
#     driver.find_element(By.LINK_TEXT, 'Next').click()
def findIndex(bNumber,xIndex):
    """Look up the value paired with ``xIndex`` in a flat key/value list.

    ``bNumber`` is read as consecutive pairs: the item at each even position
    is a key and the item right after it is its value. For example
    ``['0', '2', '1', '3', '3', '2']`` pairs '0' with '2', '1' with '3' and
    '3' with '2'.

    Args:
        bNumber: flat sequence of alternating keys and values.
        xIndex: key to look for. It is compared with ``==``, so it must be of
            the same type as the stored keys (a str such as ``'0'`` in the
            example above).

    Returns:
        The value following the first matching key, or ``None`` when no key
        matches (including a trailing key that has no value after it).
    """
    # Pairing the even-position and odd-position slices stops at the shorter
    # one, so a dangling key at the end of an odd-length list is skipped
    # instead of raising IndexError on bNumber[i + 1].
    for key, value in zip(bNumber[0::2], bNumber[1::2]):
        if xIndex == key:
            return value
    return None
    
#print(findIndex(bNumber,'3'))

# Main scrape loop: for every page of the listing, open each link found in the
# 'myTable' table, read the detail page that opens in a second window, append
# the extracted fields to text files, then move on via the 'Next' link.
# NOTE(review): this listing is damaged -- the patterns passed to re.findall
# below are empty or cut short, and the lines after `_name=` are indented
# without a block opener, so the block does not compile as shown. The missing
# text has to be restored from the original source before running.
# NOTE(review): the find_element(s)_by_* helpers used here were removed in
# Selenium 4; with a current Selenium they need the find_element(By...) form.
time.sleep(1)
# Counter of listing pages processed so far (only printed).
i=0
while True:
    table = driver.find_element_by_id('myTable')

    #total row
    table_rows = table.find_elements_by_tag_name('tr')
    #print(u"row", len(table_rows))
    # maxRow is not read again in the code shown here.
    maxRow=len(table_rows)

    aElements=table.find_elements_by_tag_name('a')
    # h1 is not read again in the code shown here.
    h1=driver.window_handles

    # Snapshot of the listing page, taken before any detail link is clicked;
    # it is checked at the bottom of the loop for the disabled 'Next' button.
    page_source = driver.page_source

    for aElement in aElements:
        time.sleep(3)
        aElement.click()
        time.sleep(2)
        # Switch to the second window handle, where the detail page is read.
        h2=driver.window_handles
        driver.switch_to.window(h2[1])
        currentPageUrl=driver.current_url
        # Load the same URL again in this window before reading its source.
        driver.get(currentPageUrl)
        time.sleep(2)
        currentPage_source = driver.page_source

        # Drop CR, LF and TAB so the regular expressions can match across lines.
        page_src_noBR = currentPage_source.replace('\r','').replace('\n','').replace('\t','')
        #page_src_noBRSPACE=re.sub(r'[\s]+', '', page_src)
        # NOTE(review): the pattern is empty, so every match is an empty string
        # and bNumber ends up empty; the real pattern was presumably lost.
        zoneAll=re.findall(r"",page_src_noBR)
        print(zoneAll)
        # All single digits of the first match, as a flat list of strings.
        bNumber=re.findall(r"[0-9]",zoneAll[0])

        # From here on `table` is the list of all <table> elements of the detail page.
        table = driver.find_elements_by_tag_name('table')

        #print(table)

        #total row
        table_rows = table[0].find_elements_by_tag_name('tr')

        # Rows 0-5 of the first table: the text of the second cell of each row.
        title = table_rows[0].find_elements_by_tag_name('td')[1].text
        year = table_rows[1].find_elements_by_tag_name('td')[1].text

        ISSN = table_rows[2].find_elements_by_tag_name('td')[1].text
            
        review = table_rows[3].find_elements_by_tag_name('td')[1].text
        access = table_rows[4].find_elements_by_tag_name('td')[1].text
        WoS = table_rows[5].find_elements_by_tag_name('td')[1].text

        #print(title,year,ISSN,review,access,WoS)
        print('         '+title,year,ISSN,review,access,WoS)
        # Append one parenthesised, comma-terminated record per journal.
        with open(listName+'index.txt','a',encoding='utf-8') as f:
            f.write('("{}","{}","{}","{}","{}","{}","{}","{}"),'.format(listName,currentPageUrl,title,year,ISSN,review,access,WoS))


        #only to handle isTop
        # Concatenates the text of every cell of the second table, skipping its
        # first row. `texts` is not read again in the code shown here.
        texts=''
        TRs= table[1].find_elements_by_tag_name('tr')
        for rows,TR in enumerate(TRs):
            if rows>0:
                TDs=TR.find_elements_by_tag_name('td')
                for index,td in enumerate(TDs):
                    #if index>0:
                     texts+=td.text

        # NOTE(review): the lazy group at the end of this pattern always matches
        # the empty string; the rest of the pattern was presumably cut off.
        zones=re.findall(r'href="/M(.*?)',page_src_noBR)
  
        # NOTE(review): empty pattern again, see the note on zoneAll above.
        _fenqu=re.findall(r'',page_src_noBR)
        #print(_fenqu)
        #print(zones)
        # NOTE(review): inside this loop the lines after `_name=` are indented
        # deeper without an `if`/`for` header, `_suoxie` is never assigned in the
        # code shown, and `name` rather than `_name` is printed and written.
        # Lines were presumably lost here; restore them before use.
        for index,zone in enumerate(zones):
            #print(zone+'\n')
            _name=re.findall(r'>(.*?)',zone)
                print(name,findIndex(bNumber,_fenqu[index][1]),_suoxie[0])
                with open(listName+'indexSmall.txt','a',encoding='utf-8') as f3:
                    f3.write('("{}","{}","{}","{}","{}"),'.format(ISSN,currentPageUrl,name,findIndex(bNumber,_fenqu[index][1]),_suoxie[0]))


        driver.close()#close current page
        time.sleep(0.5)
        driver.switch_to.window(h2[0])#back to main list page
    i+=1
    print('page:',i)
    # Stop when the listing snapshot shows the 'Next' button as disabled.
    if 'paginate_button next disabled"' in page_source:
        break
    nextPage=driver.find_element_by_link_text('Next')
    nextPage.click()
    time.sleep(0.5)