Read proxy IPs from a remote API and check each one's availability on the fly, which beats generating a static pool up front, then crawl the data through them. Cleanup steps include stripping image tags and other markup.
# -*- coding: utf-8 -*-
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import re
import time
import random
headers = {
    'user-agent': UserAgent().random
}
result = ''  # module-level leftover; getData keeps its own local copy
i = 0        # paragraph counter, reset per page in getData
k = 0        # running article id used in the generated SQL
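# Strip images, links, and ad markup from a paragraph before it is stored.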
def doReplace(text):
    # NOTE: the tag literals in the original patterns were eaten by HTML
    # rendering; the regexes below are reconstructions based on the comments.
    # Strip image tags (self-closing)
    p1 = r'(<img.*?/>)'
    text = re.sub(p1, "", text)
    # Strip image tags (non-self-closing variant)
    p0 = r'(<img.*?>)'
    text = re.sub(p0, "", text)
    # <a> tags (keep the link text)
    p2 = r'(<a.*?>)'
    text = re.sub(p2, "", text)
    p3 = r'(</a>)'
    text = re.sub(p3, "", text)
    # Strip ads
    p4 = r'(<div.*?</div>)'
    text = re.sub(p4, "", text)
    # Normalize leftover markup to breaks and drop the promo text;
    # the tag keys are assumed, since the original literals were lost
    rep = {'<p>': '<br/>', '</p>': '<br/>', '<br>': '<br/>', '&nbsp;': '<br/>',
           '(导师微信pualove104)': ''}
    for key in rep:
        text = text.replace(key, rep[key])
    return text
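# Fetch a batch of proxy addresses from the Zhima (zhimacangku) API and pick one at random.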
def getIP():
    url = 'http://webapi.http.zhimacangku.com/getip'
    response = requests.get(url)
    result = []
    for i in response.text.split("\n"):
        if i.strip():  # skip blank lines so random.choice never returns ''
            result.append(i.replace('\r', ''))
    a = random.choice(result)
    return a
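# Draw proxies until one passes the availability check.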
def getOneIP():
    a = getIP()
    while inspectIP(a) is None:
        a = getIP()  # candidate failed the check, draw another
    return a
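# Probe a proxy against baidu.com: return the address on a 200 response, else None.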
def inspectIP(ipprot):
    time.sleep(1)  # throttle: the checks were tripping rate limits
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Upgrade-Insecure-Requests': '1'
    }
    url = 'https://www.baidu.com'
    # Cover both schemes so the https test actually goes through the proxy
    proxies = {"http": "http://" + str(ipprot), "https": "http://" + str(ipprot)}
    try:
        response = requests.get(url, headers=header, proxies=proxies, timeout=5)
    except requests.RequestException:
        print('unavailable ' + ipprot)
        return None
    if response.status_code == 200:
        print('available ' + ipprot)
        return ipprot
    print('unavailable ' + ipprot)
    return None
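# Lighter cleanup applied to individual lines of a dialogue paragraph.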
def doReplace2(text):
    # The original key literals were eaten by HTML rendering; the <br>
    # variants and &nbsp; below are assumed reconstructions
    rep = {'<br>': '<br/>', '&nbsp;': '<br/>', '</p>': '<br/>'}
    for key in rep:
        text = text.replace(key, rep[key])
    return text
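# Debug helper: dump the text of each tag (not called by the main flow).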
def duihua(tags):
    result = ''
    for i in tags:
        print(i.get_text())
    return result
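# Fetch one article page through a verified proxy, clean its paragraphs, and append the INSERT statements to sql.txt.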
def getData(m):
    global k
    global i
    result = ''
    url = 'http://liaotian.puaas.com/{}.html'.format(m)
    print(url)
    # Route the request through a freshly verified proxy
    proxies = {
        "https": getOneIP()
    }
    print(proxies)
    response = requests.get(url, headers=headers, proxies=proxies)
    time.sleep(5)
    if response.status_code == 200 and len(response.text) > 4000:
        k += 1
        bs = BeautifulSoup(response.text, 'html.parser')
        title = bs.find('h1').get_text()
        # The category extraction was garbled in the source; this splits the
        # breadcrumb on the " » " separator that the lost regex
        # p2 = r'(">).+?( » )' targeted. It is an assumed reconstruction.
        crumbs = bs.get_text().split(' » ')
        listName = crumbs[1].strip() if len(crumbs) > 1 else ''
        i = 0
        for p in bs.find_all('p'):
            i += 1
            if i > 2 and p.contents:  # skip the first two rows of the page
                resultS = ''
                # '<br/>' below is an assumed stand-in for the break literal
                # that was eaten by HTML rendering
                if len(p.contents) > 1:  # dialogue: several nodes in one <p>
                    resultS += '<br/>'
                    for line in p.contents:
                        if str(line) != '<br/>':  # filter bare line breaks
                            resultS += doReplace2(str(line))
                    result += resultS + '<br/>'
                else:
                    result += doReplace(str(p.contents[0]))
        c = ('INSERT INTO zhy_article (id,title,type) VALUES({},"{}","{}")\n'
             'INSERT INTO zhy_article_body (aid,body,redirecturl) VALUES ({},"{}","{}")\n'
             ).format(k, title, listName, k, result, m)
        with open('sql.txt', 'a', encoding='utf-8') as f:
            f.write(c + '\n')
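# Walk the article id range; failed or thin pages are skipped by the length check above.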
for m in range(900, 19900):
    getData(m)