Read proxy IPs from a remote API and check each one's availability on the fly, which beats generating a static pool up front, then crawl the data through them. Cleanup steps include stripping image tags and other markup.
# -*- coding: utf-8 -*-
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import re
import time
import random
headers = {
    'user-agent': UserAgent().random
}
result = ''  # module-level leftover; getData keeps its own local copy
i = 0        # paragraph counter, reset per page in getData
k = 0        # running article id used in the generated SQL
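# Strip images, links, and ad markup from a paragraph before it is stored.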
def doReplace(text):
    # NOTE: the tag literals in the original patterns were eaten by HTML
    # rendering; the regexes below are reconstructions based on the comments.
    # Strip image tags (self-closing)
    p1 = r'(<img.*?/>)'
    text = re.sub(p1, "", text)
    # Strip image tags (non-self-closing variant)
    p0 = r'(<img.*?>)'
    text = re.sub(p0, "", text)
    # <a> tags (keep the link text)
    p2 = r'(<a.*?>)'
    text = re.sub(p2, "", text)
    p3 = r'(</a>)'
    text = re.sub(p3, "", text)
    # Strip ads
    p4 = r'(<div.*?</div>)'
    text = re.sub(p4, "", text)
    # Normalize leftover markup to breaks and drop the promo text;
    # the tag keys are assumed, since the original literals were lost
    rep = {'<p>': '<br/>', '</p>': '<br/>', '<br>': '<br/>', '&nbsp;': '<br/>',
           '(导师微信pualove104)': ''}
    for key in rep:
        text = text.replace(key, rep[key])
    return text
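# Fetch a batch of proxy addresses from the Zhima (zhimacangku) API and pick one at random.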
def getIP():
    url = 'http://webapi.http.zhimacangku.com/getip'
    response = requests.get(url)
    result = []
    for i in response.text.split("\n"):
        if i.strip():  # skip blank lines so random.choice never returns ''
            result.append(i.replace('\r', ''))
    a = random.choice(result)
    return a
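# Draw proxies until one passes the availability check.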
def getOneIP():
    a = getIP()
    while inspectIP(a) is None:
        a = getIP()  # candidate failed the check, draw another
    return a
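# Probe a proxy against baidu.com: return the address on a 200 response, else None.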
def inspectIP(ipprot):
    time.sleep(1)  # throttle: the checks were tripping rate limits
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Upgrade-Insecure-Requests': '1'
    }
    url = 'https://www.baidu.com'
    # Cover both schemes so the https test actually goes through the proxy
    proxies = {"http": "http://" + str(ipprot), "https": "http://" + str(ipprot)}
    try:
        response = requests.get(url, headers=header, proxies=proxies, timeout=5)
    except requests.RequestException:
        print('unavailable ' + ipprot)
        return None
    if response.status_code == 200:
        print('available ' + ipprot)
        return ipprot
    print('unavailable ' + ipprot)
    return None
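# Lighter cleanup applied to individual lines of a dialogue paragraph.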
def doReplace2(text):
    # The original key literals were eaten by HTML rendering; the <br>
    # variants and &nbsp; below are assumed reconstructions
    rep = {'<br>': '<br/>', '&nbsp;': '<br/>', '</p>': '<br/>'}
    for key in rep:
        text = text.replace(key, rep[key])
    return text
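# Debug helper: dump the text of each tag (not called by the main flow).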
def duihua(tags):
    result = ''
    for i in tags:
        print(i.get_text())
    return result
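# Fetch one article page through a verified proxy, clean its paragraphs, and append the INSERT statements to sql.txt.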
def getData(m):
    global k
    global i
    result = ''
    url = 'http://liaotian.puaas.com/{}.html'.format(m)
    print(url)
    # Route the request through a freshly verified proxy
    proxies = {
        "https": getOneIP()
    }
    print(proxies)
    response = requests.get(url, headers=headers, proxies=proxies)
    time.sleep(5)
    if response.status_code == 200 and len(response.text) > 4000:
        k += 1
        bs = BeautifulSoup(response.text, 'html.parser')
        title = bs.find('h1').get_text()
        # The category extraction was garbled in the source; this splits the
        # breadcrumb on the " » " separator that the lost regex
        # p2 = r'(">).+?( » )' targeted. It is an assumed reconstruction.
        crumbs = bs.get_text().split(' » ')
        listName = crumbs[1].strip() if len(crumbs) > 1 else ''
        i = 0
        for p in bs.find_all('p'):
            i += 1
            if i > 2 and p.contents:  # skip the first two rows of the page
                resultS = ''
                # '<br/>' below is an assumed stand-in for the break literal
                # that was eaten by HTML rendering
                if len(p.contents) > 1:  # dialogue: several nodes in one <p>
                    resultS += '<br/>'
                    for line in p.contents:
                        if str(line) != '<br/>':  # filter bare line breaks
                            resultS += doReplace2(str(line))
                    result += resultS + '<br/>'
                else:
                    result += doReplace(str(p.contents[0]))
        c = ('INSERT INTO zhy_article (id,title,type) VALUES({},"{}","{}")\n'
             'INSERT INTO zhy_article_body (aid,body,redirecturl) VALUES ({},"{}","{}")\n'
             ).format(k, title, listName, k, result, m)
        with open('sql.txt', 'a', encoding='utf-8') as f:
            f.write(c + '\n')
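# Walk the article id range; failed or thin pages are skipped by the length check above.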
for m in range(900, 19900):
    getData(m)