1.错误详情

Cannot establish TLS with client (sni: app.1hai.cn): TlsException("SSL handshake error: Error([('SSL routines', 'ssl3_read_bytes', 'sslv3 alert certificate unknown')],)",)

其原因是客户端不信任mitmproxy签发的HTTPS证书(例如应用启用了证书校验/证书锁定),因此客户端在TLS握手时返回"certificate unknown"告警,mitmproxy无法与客户端建立TLS连接。仅在客户端设置忽略证书错误也没用。

2.解决方案

把tls_passthrough.py中的代码整合进监听脚本即可。启动方式如下(若不需要指定端口,可省略 -p 参数,此时使用mitmproxy的默认端口8080):

指定端口(以3215为例,需与selenium代码中的代理端口一致):mitmdump -p 3215 -s tls_passthrough.py 或者 mitmweb -p 3215 -s tls_passthrough.py

再在另一终端运行selenium代码。

3.tls_passthrough.py代码

# -*- coding: utf-8 -*-

"""
This inline script allows conditional TLS Interception based
on a user-defined strategy.
Example:
    > mitmdump -s tls_passthrough.py
    1. curl --proxy http://localhost:8080 https://example.com --insecure
    // works - we'll also see the contents in mitmproxy
    2. curl --proxy http://localhost:8080 https://example.com --insecure
    // still works - we'll also see the contents in mitmproxy
    3. curl --proxy http://localhost:8080 https://example.com
    // fails with a certificate error, which we will also see in mitmproxy
    4. curl --proxy http://localhost:8080 https://example.com
    // works again, but mitmproxy does not intercept and we do *not* see the contents
Authors: Maximilian Hils, Matthew Tuusberg
"""
import collections
import random

from enum import Enum

import mitmproxy
from mitmproxy import ctx
from mitmproxy.exceptions import TlsProtocolException
from mitmproxy.proxy.protocol import TlsLayer, RawTCPLayer


class InterceptionResult(Enum):
    """Outcome recorded in a strategy's history for one client connection."""

    # TLS with the client was established while intercepting.
    success = True
    # Establishing TLS with the client raised TlsProtocolException.
    failure = False
    # Interception was not attempted; the connection was passed through.
    skipped = None


class _TlsStrategy:
    """
    Common base for interception strategies.

    Keeps, per server address, a bounded log of interception outcomes and
    leaves the intercept / pass-through decision to subclasses.
    """

    def __init__(self):
        def _new_log():
            # Only the 500 most recent outcomes per address are retained.
            return collections.deque(maxlen=500)

        # Maps server_address -> deque of InterceptionResult values.
        self.history = collections.defaultdict(_new_log)

    def should_intercept(self, server_address):
        """
        Decide how to treat a connection to ``server_address``.

        Returns:
            True, if we should attempt to intercept the connection.
            False, if we want to employ pass-through instead.

        Raises:
            NotImplementedError: always; subclasses must override this.
        """
        raise NotImplementedError()

    def _remember(self, server_address, outcome):
        """Append ``outcome`` to the history kept for ``server_address``."""
        self.history[server_address].append(outcome)

    def record_success(self, server_address):
        """Note a successful interception for ``server_address``."""
        self._remember(server_address, InterceptionResult.success)

    def record_failure(self, server_address):
        """Note a failed interception attempt for ``server_address``."""
        self._remember(server_address, InterceptionResult.failure)

    def record_skipped(self, server_address):
        """Note that interception was skipped for ``server_address``."""
        self._remember(server_address, InterceptionResult.skipped)


class ConservativeStrategy(_TlsStrategy):
    """
    Conservative interception strategy: keep intercepting an address only
    while its recorded history contains no failed attempt.
    """

    def should_intercept(self, server_address):
        """Return False once a failure is on record for ``server_address``."""
        past_results = self.history[server_address]
        return InterceptionResult.failure not in past_results


class ProbabilisticStrategy(_TlsStrategy):
    """
    Intercept each connection with a fixed probability ``p``; the recorded
    history is not consulted.
    """

    def __init__(self, p):
        super().__init__()
        # Threshold compared against a uniform draw from 0..1.
        self.p = p

    def should_intercept(self, server_address):
        """Return True when a fresh uniform draw falls below ``self.p``."""
        draw = random.uniform(0, 1)
        return draw < self.p


class TlsFeedback(TlsLayer):
    """
    TlsLayer variant whose _establish_tls_with_client reports back to the
    active strategy whether TLS with the client could be established
    (which may fail due to cert pinning).
    """

    def _establish_tls_with_client(self):
        """
        Run the parent's client-side TLS setup and record the outcome.

        Raises:
            TlsProtocolException: re-raised after the failure is recorded.
        """
        # Read the address up front so both outcomes are recorded against it.
        address = self.server_conn.address

        try:
            super()._establish_tls_with_client()
        except TlsProtocolException:
            tls_strategy.record_failure(address)
            raise

        # Only reached when the handshake did not raise.
        tls_strategy.record_success(address)


# Inline script hooks below.

# Active interception strategy shared by the hooks; None until configure() assigns one.
tls_strategy = None


def load(l):
    """
    Register the ``tlsstrat`` option (an int defaulting to 0) on ``l``.

    Args:
        l: object exposing ``add_option``; presumably the mitmproxy loader
            passed in when the script is loaded.
    """
    option_name = "tlsstrat"
    option_help = "TLS passthrough strategy (0-100)"
    l.add_option(option_name, int, 0, option_help)


def configure(updated):
    """
    Rebuild the module-level ``tls_strategy`` from the ``tlsstrat`` option.

    A positive value selects ProbabilisticStrategy with probability
    ``tlsstrat / 100``; any other value selects ConservativeStrategy.

    Args:
        updated: not used by this hook.
    """
    global tls_strategy
    percent = ctx.options.tlsstrat
    if percent > 0:
        tls_strategy = ProbabilisticStrategy(float(percent) / 100.0)
        return
    tls_strategy = ConservativeStrategy()


def next_layer(next_layer):
    """
    Decide, for a planned client-side TLS layer, between interception and
    pass-through.

    Layers that are not a TlsLayer with ``_client_tls`` set are left alone.
    Otherwise the active strategy is asked: on "intercept" the layer is
    turned into a TlsFeedback so the handshake outcome gets recorded; on
    "pass-through" a RawTCPLayer is sent as the replacement and a
    "skipped" entry is recorded.
    """
    if not (isinstance(next_layer, TlsLayer) and next_layer._client_tls):
        return

    server_address = next_layer.server_conn.address

    if tls_strategy.should_intercept(server_address):
        # Swap the instance's class in place so the TLS layer reports back
        # whether interception worked.
        next_layer.__class__ = TlsFeedback
        return

    # No interception: log it, hand back a raw TCP layer, note the skip.
    mitmproxy.ctx.log("TLS passthrough for %s" % repr(server_address), "info")
    passthrough_layer = RawTCPLayer(next_layer.ctx, ignore=True)
    next_layer.reply.send(passthrough_layer)
    tls_strategy.record_skipped(server_address)

4.selenium爬虫代码示例(模拟翻页)

from selenium import webdriver
from bs4 import BeautifulSoup
import time
# Chrome launch options. `selenium.webdriver` exposes the Chrome options class
# as `ChromeOptions`; there is no `webdriver.Options` attribute.
option = webdriver.ChromeOptions()

option.add_argument('--no-sandbox')
option.add_argument('--disable-dev-shm-usage')
# Route all browser traffic through the local proxy listening on port 3215.
option.add_argument('--proxy-server=http://127.0.0.1:3215')

driver = webdriver.Chrome(executable_path='chromedriver', options=option)
# Hotel list page used as the starting point of the crawl.
url='https://hotel.tuniu.com/list/2500p0s0b0?checkindate=2021-05-31&checkoutdate=2021-06-01&cityName=%E4%B8%8A%E6%B5%B7&suggest=%7B%22code%22%3A2014281,%22name%22%3A%22%E5%B7%A8%E5%B3%B0%E8%B7%AF%E4%B8%8A%E6%B5%B7%E5%9C%B0%E9%93%816%E5%8F%B7%E7%BA%BF%20,%E4%B8%8A%E6%B5%B7,%E4%B8%AD%E5%9B%BD%22,%22type%22%3A13,%22catalog%22%3A2,%22cityCode%22%3A2500,%22longitude%22%3A121.588524,%22latitude%22%3A31.280576,%22chineseName%22%3A%22%E5%B7%A8%E5%B3%B0%E8%B7%AF%E4%B8%8A%E6%B5%B7%E5%9C%B0%E9%93%816%E5%8F%B7%E7%BA%BF%20%22,%22cityType%22%3A0,%22typeName%22%3A%22%E5%9C%B0%E9%93%81%22,%22cityName%22%3A%22%E4%B8%8A%E6%B5%B7%22,%22countryCode%22%3A40002,%22countryName%22%3A%22%E4%B8%AD%E5%9B%BD%22,%22hotelCount%22%3A391,%22score%22%3A62.15348,%22domestic%22%3Atrue,%22border%22%3Atrue%7D'
driver.get(url)
# Snapshot of the first page; passed to click_next() as its initial page source.
page_src = driver.page_source
bs = BeautifulSoup(page_src, "html.parser")

def click_next(driver, page_source, city, wait_seconds=20):
    """
    Page through a result list by clicking the "next" arrow until it is gone.

    Progress is printed for every page. The driver is always quit before
    returning, including when locating or clicking the arrow raises, so the
    browser process is not left running after a failure.

    Args:
        driver: browser driver exposing ``page_source``,
            ``find_element_by_class_name`` and ``quit``.
        page_source: HTML of the page that is already loaded; if it holds no
            "next" arrow, no click is attempted.
        city: label used in the progress messages.
        wait_seconds: pause after each click before the page is inspected
            again (defaults to the original 20 seconds).

    Raises:
        Whatever the driver raises while locating or clicking the arrow.
    """
    # Text whose presence in the page source signals that a next page exists.
    next_marker = 'arrowR"'
    # n tracks crawl progress: the number of the page currently being crawled.
    n = 1
    try:
        print('正在爬取', city, '酒店信息', '第%d页' % n)
        # When the city has a single page of hotels, the loop is skipped.
        if next_marker in page_source:
            # Re-read the live page each round to see whether a next page remains.
            while next_marker in driver.page_source:
                driver.find_element_by_class_name('arrowR').click()
                time.sleep(wait_seconds)

                n += 1
                print('正在爬取', city, '酒店信息', '第%d页' % n)
        print(city, '爬取完成', '共%d页' % n)
    finally:
        driver.quit()
# Page through the results starting from the snapshot taken after driver.get();
# click_next() calls driver.quit() itself before it returns.
click_next(driver,page_src,'shanghai')
# NOTE(review): left disabled — the driver has already been quit by click_next(),
# so reading page_source here would presumably fail.
#print(driver.page_source)
time.sleep(100)
# NOTE(review): this is a second quit; click_next() already quit the driver.
driver.quit()