1.错误详情
Cannot establish TLS with client (sni: app.1hai.cn): TlsException("SSL handshake error: Error([('SSL routines', 'ssl3_read_bytes', 'sslv3 alert certificate unknown')],)",)
其原因是客户端(浏览器)不信任mitmproxy签发的证书(报错中的 certificate unknown 即客户端拒绝了该证书),导致mitmproxy与客户端之间的TLS握手失败。仅在浏览器端设置忽略证书错误也没用。
2.解决方案
把tls_passthrough.py中的代码整合进监听脚本即可。启动方式如下(若不指定端口,可省略):
指定端口:mitmdump -p 3215 -s tls_passthrough.py(-s 后换成你自己的监听脚本;端口须与selenium代码中 --proxy-server 的端口一致,此处以3215为例)
再在另一终端运行selenium代码。
3.tls_passthrough.py代码
# -*- coding: utf-8 -*-
"""
Inline mitmproxy script: decide per connection whether to intercept TLS.

A user-selectable strategy chooses between intercepting a TLS connection and
passing it through untouched as raw TCP.

Example:

    > mitmdump -s tls_passthrough.py

    1. curl --proxy http://localhost:8080 https://example.com --insecure
    // works - we'll also see the contents in mitmproxy

    2. curl --proxy http://localhost:8080 https://example.com --insecure
    // still works - we'll also see the contents in mitmproxy

    3. curl --proxy http://localhost:8080 https://example.com
    // fails with a certificate error, which we will also see in mitmproxy

    4. curl --proxy http://localhost:8080 https://example.com
    // works again, but mitmproxy does not intercept and we do *not* see the contents

NOTE(review): this script imports TlsLayer/RawTCPLayer from
``mitmproxy.proxy.protocol``; confirm that module exists in the installed
mitmproxy version before relying on it.

Authors: Maximilian Hils, Matthew Tuusberg
"""
import collections
import random
from enum import Enum

import mitmproxy
from mitmproxy import ctx
from mitmproxy.exceptions import TlsProtocolException
from mitmproxy.proxy.protocol import TlsLayer, RawTCPLayer


class InterceptionResult(Enum):
    """Outcome recorded for one connection to a given server address."""

    success = True
    failure = False
    skipped = None


class _TlsStrategy:
    """
    Base class for interception strategies.

    Keeps, per server address, a bounded history of past outcomes that
    subclasses may consult when deciding whether to intercept.
    """

    def __init__(self):
        # server_address -> deque of the most recent 500 InterceptionResult values
        self.history = collections.defaultdict(lambda: collections.deque(maxlen=500))

    def should_intercept(self, server_address):
        """
        Decide how to treat a new connection to ``server_address``.

        Returns:
            True, if we should attempt to intercept the connection.
            False, if we want to employ pass-through instead.
        """
        raise NotImplementedError()

    def _remember(self, server_address, outcome):
        """Append ``outcome`` to the history kept for ``server_address``."""
        self.history[server_address].append(outcome)

    def record_success(self, server_address):
        """Note that TLS interception worked for ``server_address``."""
        self._remember(server_address, InterceptionResult.success)

    def record_failure(self, server_address):
        """Note that TLS interception failed for ``server_address``."""
        self._remember(server_address, InterceptionResult.failure)

    def record_skipped(self, server_address):
        """Note that the connection to ``server_address`` was passed through."""
        self._remember(server_address, InterceptionResult.skipped)


class ConservativeStrategy(_TlsStrategy):
    """
    Conservative Interception Strategy - intercept only while the recorded
    history for the address contains no failed attempt.
    """

    def should_intercept(self, server_address):
        return InterceptionResult.failure not in self.history[server_address]


class ProbabilisticStrategy(_TlsStrategy):
    """
    Intercept each connection with a fixed probability ``p``.
    """

    def __init__(self, p):
        self.p = p
        super().__init__()

    def should_intercept(self, server_address):
        # random.random() draws from the same [0, 1) range as uniform(0, 1).
        return random.random() < self.p


class TlsFeedback(TlsLayer):
    """
    TlsLayer subclass swapped onto live layers (see ``next_layer``) so the
    strategy learns whether TLS could be established with the client
    (which may fail due to cert pinning).
    """

    def _establish_tls_with_client(self):
        address = self.server_conn.address
        try:
            super()._establish_tls_with_client()
        except TlsProtocolException:
            # Remember the failure, then let the error propagate unchanged.
            tls_strategy.record_failure(address)
            raise
        tls_strategy.record_success(address)


# inline script hooks below.

# Active strategy instance; assigned in configure().
tls_strategy = None


def load(l):
    """Register the ``tlsstrat`` option with mitmproxy's option loader."""
    l.add_option(
        "tlsstrat", int, 0, "TLS passthrough strategy (0-100)",
    )


def configure(updated):
    """
    (Re)build the global strategy from the ``tlsstrat`` option.

    A value above 0 selects ProbabilisticStrategy with that percentage as
    the interception probability; otherwise ConservativeStrategy is used.
    """
    global tls_strategy
    percent = ctx.options.tlsstrat
    if percent > 0:
        tls_strategy = ProbabilisticStrategy(percent / 100.0)
    else:
        tls_strategy = ConservativeStrategy()


def next_layer(next_layer):
    """
    Hook that does the actual work: when the upcoming layer is a client-side
    TLS layer, ask the strategy whether to intercept or to pass through.
    """
    is_client_tls = isinstance(next_layer, TlsLayer) and next_layer._client_tls
    if not is_client_tls:
        return

    server_address = next_layer.server_conn.address

    if tls_strategy.should_intercept(server_address):
        # Intercept: swap the layer's class so the handshake outcome is
        # reported back to the strategy.
        next_layer.__class__ = TlsFeedback
        return

    # Pass-through: answer with a raw TCP layer and record a "skipped" entry.
    mitmproxy.ctx.log("TLS passthrough for %s" % repr(next_layer.server_conn.address), "info")
    passthrough_layer = RawTCPLayer(next_layer.ctx, ignore=True)
    next_layer.reply.send(passthrough_layer)
    tls_strategy.record_skipped(server_address)
4.selenium爬虫代码示例(模拟翻页)
"""Selenium demo: page through a hotel list while routed through a local proxy."""
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

# Local proxy the browser is routed through; must match the proxy's listen port.
PROXY_SERVER = 'http://127.0.0.1:3215'

# Hotel list page used as the crawl entry point.
LIST_URL = 'https://hotel.tuniu.com/list/2500p0s0b0?checkindate=2021-05-31&checkoutdate=2021-06-01&cityName=%E4%B8%8A%E6%B5%B7&suggest=%7B%22code%22%3A2014281,%22name%22%3A%22%E5%B7%A8%E5%B3%B0%E8%B7%AF%E4%B8%8A%E6%B5%B7%E5%9C%B0%E9%93%816%E5%8F%B7%E7%BA%BF%20,%E4%B8%8A%E6%B5%B7,%E4%B8%AD%E5%9B%BD%22,%22type%22%3A13,%22catalog%22%3A2,%22cityCode%22%3A2500,%22longitude%22%3A121.588524,%22latitude%22%3A31.280576,%22chineseName%22%3A%22%E5%B7%A8%E5%B3%B0%E8%B7%AF%E4%B8%8A%E6%B5%B7%E5%9C%B0%E9%93%816%E5%8F%B7%E7%BA%BF%20%22,%22cityType%22%3A0,%22typeName%22%3A%22%E5%9C%B0%E9%93%81%22,%22cityName%22%3A%22%E4%B8%8A%E6%B5%B7%22,%22countryCode%22%3A40002,%22countryName%22%3A%22%E4%B8%AD%E5%9B%BD%22,%22hotelCount%22%3A391,%22score%22%3A62.15348,%22domestic%22%3Atrue,%22border%22%3Atrue%7D'


def click_next(driver, page_source, city, wait_seconds=20):
    """
    Click the "next page" arrow until it no longer appears in the page.

    Args:
        driver: Selenium WebDriver already showing the first result page.
        page_source: HTML of the first page; used only to decide whether
            there is more than one page.
        city: Label printed in the progress messages.
        wait_seconds: Pause after each click so the next page can load.

    Returns:
        The number of pages visited (at least 1).

    The driver is NOT closed here; the caller owns its lifetime.
    """
    # n tracks crawl progress (the page currently shown).
    n = 1
    print('正在爬取', city, '酒店信息', '第%d页' % n)

    # If the city has only one page of hotels, the loop below never runs.
    if 'arrowR"' in page_source:
        # Re-read the live page each round: the arrow markup is absent on the last page.
        while 'arrowR"' in driver.page_source:
            driver.find_element(By.CLASS_NAME, 'arrowR').click()
            time.sleep(wait_seconds)
            # Parse driver.page_source here to extract this page's hotel data.
            n += 1
            print('正在爬取', city, '酒店信息', '第%d页' % n)

    print(city, '爬取完成', '共%d页' % n)
    return n


def main():
    """Start Chrome behind the proxy, open the list page and walk all pages."""
    option = webdriver.ChromeOptions()
    option.add_argument('--no-sandbox')
    option.add_argument('--disable-dev-shm-usage')
    option.add_argument('--proxy-server=' + PROXY_SERVER)

    # chromedriver is located via PATH, the same lookup the name 'chromedriver' implied.
    driver = webdriver.Chrome(options=option)
    try:
        driver.get(LIST_URL)
        page_src = driver.page_source
        # Parsed first page, kept available for data extraction.
        bs = BeautifulSoup(page_src, "html.parser")
        click_next(driver, page_src, 'shanghai')
    finally:
        # Single place that shuts the browser down, even if crawling raised.
        driver.quit()


if __name__ == '__main__':
    main()