# coding=utf-8
"""Multi-threaded directory scanner that routes requests through public proxies.

A background thread downloads a list of proxy addresses, keeps the ones that
can reach a check URL, and feeds them into a queue.  Worker threads take paths
from a word list (``dicc.txt`` next to this script), request each one on the
target site with ``HEAD`` through a proxy from the queue, and print the paths
whose status code suggests they exist.

Only scan hosts you are authorized to test.
"""
import os
import random
import re
import sys
import threading
from queue import Empty, Queue

import requests

# Pool of User-Agent strings; one is picked at random for each header set.
USERAGENT = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]

# Pool of Referer values; one is picked at random for each header set.
REFERERS = [
    "https://www.baidu.com",
    "http://www.baidu.com",
    "https://www.google.com.hk",
    "http://www.so.com",
    "http://www.sogou.com",
    "http://www.soso.com",
    "http://www.bing.com",
]

# Page that lists candidate proxy addresses as plain "ip:port" text.
PROXY_SOURCE_URL = "http://www.66ip.cn/nmtq.php?getnum=10000"
# URL requested through each candidate proxy to decide whether it is usable.
PROXY_CHECK_URL = "http://120.79.66.58/"
# Timeout (seconds) for downloading the proxy list.
PROXY_SOURCE_TIMEOUT = 15
# Timeout (seconds) for every proxied HEAD request.
REQUEST_TIMEOUT = 3
# "a.b.c.d:port" -- the port is mandatory, an address without one is unusable.
PROXY_PATTERN = re.compile(r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}:[0-9]+")
# Status codes that are reported as "path exists".
REPORTED_STATUS_CODES = (200, 403, 302, 401)
# File name of the word list, looked up in the directory of this script.
WORDLIST_NAME = 'dicc.txt'
DEFAULT_SCAN_URL = "http://www.willsemi.com/"
DEFAULT_THREAD_COUNT = 25

# Errors treated as "this request failed" rather than as a crash.  ValueError
# is included because a malformed proxy address may be rejected during URL
# parsing before any network I/O happens.
_REQUEST_ERRORS = (requests.RequestException, ValueError)


def _random_headers():
    """Return request headers with a randomly chosen User-Agent and Referer."""
    return {
        'User-Agent': random.choice(USERAGENT),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Cache-Control': 'max-age=0',
        'referer': random.choice(REFERERS),
        'Accept-Charset': 'GBK,utf-8;q=0.7,*;q=0.3',
    }


def _proxy_mapping(address):
    """Build a ``requests`` proxies mapping for an ``ip:port`` address.

    ``requests`` chooses the proxy by the scheme of the requested URL, so both
    schemes are mapped; with only an ``'https'`` entry, ``http://`` URLs would
    bypass the proxy entirely.
    """
    proxy_url = 'http://' + address
    return {'http': proxy_url, 'https': proxy_url}


def _proxy_works(proxy, headers):
    """Return True if a HEAD request to the check URL via *proxy* yields 200."""
    try:
        resp = requests.head(url=PROXY_CHECK_URL, timeout=REQUEST_TIMEOUT,
                             headers=headers, verify=False,
                             allow_redirects=False, proxies=proxy)
    except _REQUEST_ERRORS:
        return False
    return resp.status_code == 200


def get_proxy_ip(ip_queue, done_event=None):
    """Download candidate proxies and put the working ones on *ip_queue*.

    Args:
        ip_queue: queue that receives one ``requests`` proxies mapping per
            proxy that passed the check.
        done_event: optional ``threading.Event``; it is set when this function
            finishes (successfully or not) so that consumers know no further
            proxies will be added.
    """
    headers = _random_headers()
    try:
        try:
            resp = requests.get(PROXY_SOURCE_URL, headers=headers,
                                timeout=PROXY_SOURCE_TIMEOUT)
        except requests.RequestException as exc:
            print('代理ip网站请求失败,获取代理ip失败! (%s)' % exc)
            return
        if resp.status_code != 200:
            print('代理ip网站非200,获取代理ip失败!')
            return
        for address in PROXY_PATTERN.findall(resp.text):
            proxy = _proxy_mapping(address)
            if _proxy_works(proxy, headers):
                ip_queue.put(proxy)
    finally:
        # Always signal completion; calling exit() here would only end this
        # thread and leave the scanner threads waiting for proxies forever.
        if done_event is not None:
            done_event.set()


class DirScan(threading.Thread):
    """Worker thread that probes word-list paths on a target through proxies.

    Args:
        url: base URL of the target; a trailing ``/`` is added when missing.
        ip_queue: queue of ``requests`` proxies mappings (shared pool).
        path_queue: queue of relative paths to probe (shared by all workers).
        proxies_done: optional ``threading.Event`` that is set once no more
            proxies will be added to *ip_queue*.  Without it the worker waits
            for a proxy indefinitely.
    """

    def __init__(self, url, ip_queue, path_queue, proxies_done=None):
        super(DirScan, self).__init__()
        self.url = url
        self.ip_queue = ip_queue
        self.path_queue = path_queue
        self.proxies_done = proxies_done

    def _next_proxy(self):
        """Return the next proxy mapping, or None when none will ever arrive."""
        if self.proxies_done is None:
            return self.ip_queue.get()
        while True:
            try:
                return self.ip_queue.get(timeout=1)
            except Empty:
                # Give up only once the producer has finished and the pool is
                # still empty; otherwise keep waiting for the next proxy.
                if self.proxies_done.is_set() and self.ip_queue.empty():
                    return None

    def run(self):
        """Probe paths until the shared path queue is empty."""
        if not self.url.endswith('/'):
            self.url += '/'
        while True:
            # Non-blocking get: the queue is shared, so its size at start-up
            # says nothing about how many paths this particular thread gets.
            try:
                path = self.path_queue.get_nowait()
            except Empty:
                break
            proxy = self._next_proxy()
            if proxy is None:
                # Hand the path back so a worker that still has a proxy can
                # take it, then stop this thread.
                self.path_queue.put(path)
                print('没有可用的代理ip,线程退出!')
                return
            target = self.url + path
            try:
                resp = requests.head(url=target, timeout=REQUEST_TIMEOUT,
                                     headers=_random_headers(), verify=False,
                                     allow_redirects=False, proxies=proxy)
            except _REQUEST_ERRORS:
                # Best effort: skip this path and drop the proxy that failed.
                continue
            # The proxy answered, so return it to the pool for reuse.
            self.ip_queue.put(proxy)
            if resp.status_code in REPORTED_STATUS_CODES:
                print("[*状态码:%d] 存在的路径: %s 代理的ip为: %s"
                      % (resp.status_code, target, proxy))
            else:
                print(target)
        print("扫描完成!")


def main(scan_url=DEFAULT_SCAN_URL, thread_count=DEFAULT_THREAD_COUNT):
    """Load the word list, start the proxy harvester and run the scan.

    Args:
        scan_url: base URL of the site to scan.
        thread_count: number of ``DirScan`` worker threads to start.
    """
    ip_queue = Queue()
    path_queue = Queue()
    # Resolve the word list relative to this script instead of slicing a
    # fixed number of characters off ``__file__``.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    wordlist = os.path.join(script_dir, WORDLIST_NAME)
    with open(wordlist, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            path = line.strip()
            if path:
                path_queue.put(path)

    proxies_done = threading.Event()
    # Daemon thread: validating the remaining proxies must not keep the
    # process alive after every worker has finished.
    harvester = threading.Thread(target=get_proxy_ip,
                                 args=(ip_queue, proxies_done))
    harvester.daemon = True
    harvester.start()

    workers = [DirScan(scan_url, ip_queue, path_queue, proxies_done)
               for _ in range(thread_count)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()


if __name__ == '__main__':
    # An optional first command-line argument overrides the default target.
    main(*sys.argv[1:2])
Python3 构建代理 IP 池,模拟 Referer 头,多线程根据字典扫描网站目录
阅读:4447 输入:2021-07-20 20:55:36