采集期刊分区表:对方网站通过 JSON 展示数据,且有是否订阅的判断;尝试直接读取 JSON 失败后,决定使用 Selenium。
对方使用 CSS 伪元素防采集,难点在于用 driver 操作表格。
注意:以下代码经复制粘贴后存在转义问题,部分正则表达式的内容已丢失,显示不完整。
"""Scrape the journal partition (fenqubiao) tables with Selenium.

The site checks subscription status and uses CSS pseudo-elements as an
anti-scraping measure, so directly reading its JSON fails; we drive a
real Chrome instead.  Results are appended to two text files:
  <listName>index.txt       -- one record per journal detail page
  <listName>indexSmall.txt  -- one record per sub-category row

NOTE(review): this script was recovered from an escaped copy-paste.
Three regular-expression patterns were lost in transit and are marked
with `TODO(lost pattern)` below -- they MUST be restored from the live
page source before the script can produce useful output.  The code is
guarded so that a missing pattern prints a warning instead of crashing.
"""

import re
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains

url = 'https://www-fenqubiao-com.vpn.cust.edu.cn/'
# url = 'http://localhost/3.html'

option = webdriver.ChromeOptions()
# option.add_argument('headless')
driver = webdriver.Chrome(executable_path='chromedriver', chrome_options=option)
driver.get(url)
page_src = driver.page_source
# print(page_src)

listName = '医学'  # which subject list to scrape

# The site is slow through the campus VPN gateway -- wait for the login form.
time.sleep(25)
driver.find_element_by_id('Username').clear()
driver.find_element_by_id('Username').send_keys('cust')
driver.find_element_by_id('Password').clear()
driver.find_element_by_id('Password').send_keys('cust')
time.sleep(1)
driver.find_element_by_id('login_button').click()
# NOTE(review): the original clicked the login button twice ("click
# shengjiban"); kept as-is in case the first click only dismisses an
# upgrade banner -- confirm against the live site.
driver.find_element_by_id('login_button').click()
time.sleep(1)

# Follow the link to the 2022 ("new version") table.
for a in driver.find_elements_by_tag_name('a'):
    href = a.get_attribute('href')
    if href is not None and '/Connect/NewVersion.aspx' in href:
        print('go to 2022 page')
        a.click()
        break
time.sleep(2)

driver.find_element_by_link_text(listName).click()

'''
# Recovery helper: when an error happened deep into a long list,
# fast-forward to the page we stopped at before resuming the scrape.
for i in range(1, 177):  # now begin on page 177
    time.sleep(2)
    print('now page', i)
    driver.find_element_by_link_text('Next').click()
'''


def findIndex(bNumber, xIndex):
    """Look up the value paired with *xIndex* in a flat index/value list.

    *bNumber* alternates index and value, e.g. ['0','2','1','3','3','2']
    means index '0' -> '2', index '1' -> '3', index '3' -> '2'.
    Returns None when *xIndex* does not occur at an even position.
    """
    for i in range(0, len(bNumber)):
        if xIndex == bNumber[i] and i % 2 == 0:
            return bNumber[i + 1]


# print(findIndex(bNumber, '3'))

time.sleep(1)
page_count = 0
while True:
    # The list page: every <a> in #myTable opens a journal detail window.
    list_table = driver.find_element_by_id('myTable')
    journal_links = list_table.find_elements_by_tag_name('a')
    page_source = driver.page_source  # kept to detect the disabled Next button

    for link in journal_links:
        time.sleep(3)
        link.click()  # opens the detail page in a new window
        time.sleep(2)
        handles = driver.window_handles
        if len(handles) < 2:
            # Click did not open a new window -- skip rather than crash.
            print('warning: no detail window opened, skipping link')
            continue
        driver.switch_to.window(handles[1])
        currentPageUrl = driver.current_url
        driver.get(currentPageUrl)  # reload so page_source reflects the detail page
        time.sleep(2)

        # Flatten the markup so the regexes can work across line breaks.
        page_src_noBR = (driver.page_source
                         .replace('\r', '')
                         .replace('\n', '')
                         .replace('\t', ''))

        # TODO(lost pattern): this regex captured the pseudo-element style
        # block that maps CSS class indices to the real partition digits.
        # Restore it from the live page source; with an empty pattern the
        # digit map below stays empty.
        zoneAll = re.findall(r"", page_src_noBR)
        print(zoneAll)
        bNumber = re.findall(r"[0-9]", zoneAll[0]) if zoneAll else []

        detail_tables = driver.find_elements_by_tag_name('table')
        # First table: the journal summary (label in td[0], value in td[1]).
        summary_rows = detail_tables[0].find_elements_by_tag_name('tr')
        title = summary_rows[0].find_elements_by_tag_name('td')[1].text
        year = summary_rows[1].find_elements_by_tag_name('td')[1].text
        ISSN = summary_rows[2].find_elements_by_tag_name('td')[1].text
        review = summary_rows[3].find_elements_by_tag_name('td')[1].text
        access = summary_rows[4].find_elements_by_tag_name('td')[1].text
        WoS = summary_rows[5].find_elements_by_tag_name('td')[1].text
        print(' ' + title, year, ISSN, review, access, WoS)
        with open(listName + 'index.txt', 'a', encoding='utf-8') as f:
            f.write('("{}","{}","{}","{}","{}","{}","{}","{}"),'.format(
                listName, currentPageUrl, title, year, ISSN, review, access, WoS))

        # Second table: sub-category rows -- only to handle isTop.
        # NOTE(review): `texts` is accumulated but never consumed; kept
        # because the original did the same, presumably for debugging.
        texts = ''
        for row_no, tr in enumerate(detail_tables[1].find_elements_by_tag_name('tr')):
            if row_no > 0:
                for td in tr.find_elements_by_tag_name('td'):
                    texts += td.text

        # TODO(lost pattern): the tail of this pattern was truncated by the
        # escaping -- it should capture each sub-category anchor's markup.
        zones = re.findall(r'href="/M(.*?)', page_src_noBR)
        # TODO(lost pattern): captured the per-row partition class tokens
        # whose second character indexes into bNumber via findIndex().
        _fenqu = re.findall(r'', page_src_noBR)
        # print(_fenqu)
        # print(zones)
        for index, zone in enumerate(zones):
            # TODO(lost pattern): closing delimiter of the name capture lost.
            _name = re.findall(r'>(.*?)', zone)
            sub_name = _name[0] if _name else ''
            # TODO(lost line): the statement defining `_suoxie` (the
            # abbreviation) was lost entirely; restore its regex.
            _suoxie = re.findall(r'', zone)
            suoxie = _suoxie[0] if _suoxie else ''
            fenqu_key = _fenqu[index][1] if index < len(_fenqu) and len(_fenqu[index]) > 1 else ''
            # BUGFIX: the original printed/wrote the stale loop variable
            # `name` (an <a> element from the navigation loop) instead of
            # the freshly extracted sub-category name.
            print(sub_name, findIndex(bNumber, fenqu_key), suoxie)
            with open(listName + 'indexSmall.txt', 'a', encoding='utf-8') as f3:
                f3.write('("{}","{}","{}","{}","{}"),'.format(
                    ISSN, currentPageUrl, sub_name,
                    findIndex(bNumber, fenqu_key), suoxie))

        driver.close()  # close the detail window
        time.sleep(0.5)
        driver.switch_to.window(handles[0])  # back to the main list page

    page_count += 1
    print('page:', page_count)
    # The paginator marks the Next button disabled on the last page.
    if 'paginate_button next disabled"' in page_source:
        break
    driver.find_element_by_link_text('Next').click()
    time.sleep(0.5)