Cannot establish TLS with client (sni: app.1hai.cn): TlsException("SSL handshake error: Error([('SSL routines', 'ssl3_read_bytes', 'sslv3 alert certificate unknown')],)",)
其原因是客户端不信任mitmproxy的https证书(例如未安装mitmproxy的CA证书),导致mitmproxy与客户端的TLS握手出错。忽略证书也没用。
把tls_passthrough.py中的代码整合进监听脚本即可。启动方式如下(若不指定端口,可省略):
指定端口:mitmdump -p 3215 -s <监听脚本>.py(端口需与selenium代码中--proxy-server的端口一致)
再在另一终端运行selenium代码。
# -*- coding: utf-8 -*-
"""
This inline script allows conditional TLS Interception based
on a user-defined strategy.
Example:
> mitmdump -s tls_passthrough.py
1. curl --proxy http://localhost:8080 https://example.com --insecure
// works - we'll also see the contents in mitmproxy
2. curl --proxy http://localhost:8080 https://example.com --insecure
// still works - we'll also see the contents in mitmproxy
3. curl --proxy http://localhost:8080 https://example.com
// fails with a certificate error, which we will also see in mitmproxy
4. curl --proxy http://localhost:8080 https://example.com
// works again, but mitmproxy does not intercept and we do *not* see the contents
Authors: Maximilian Hils, Matthew Tuusberg
"""
import collections
import random
from enum import Enum
import mitmproxy
from mitmproxy import ctx
from mitmproxy.exceptions import TlsProtocolException
from mitmproxy.proxy.protocol import TlsLayer, RawTCPLayer
class InterceptionResult(Enum):
    """Outcome stored in a strategy's per-server history."""

    # Client-side TLS was established while intercepting.
    success = True
    # Interception was attempted and the client-side TLS setup raised.
    failure = False
    # No interception was attempted; the connection was passed through.
    skipped = None
class _TlsStrategy:
"""
Abstract base class for interception strategies.
"""
def __init__(self):
# A server_address -> interception results mapping
self.history = collections.defaultdict(lambda: collections.deque(maxlen=500))
def should_intercept(self, server_address):
"""
Returns:
True, if we should attempt to intercept the connection.
False, if we want to employ pass-through instead.
"""
raise NotImplementedError()
def record_success(self, server_address):
self.history[server_address].append(InterceptionResult.success)
def record_failure(self, server_address):
self.history[server_address].append(InterceptionResult.failure)
def record_skipped(self, server_address):
self.history[server_address].append(InterceptionResult.skipped)
class ConservativeStrategy(_TlsStrategy):
    """
    Conservative Interception Strategy - intercept a server only while its
    recorded history contains no failed attempt.
    """

    def should_intercept(self, server_address):
        """Return False once a failure is on record for ``server_address``."""
        past_results = self.history[server_address]
        return InterceptionResult.failure not in past_results
class ProbabilisticStrategy(_TlsStrategy):
    """
    Intercept each connection with a fixed probability, ignoring history.
    """

    def __init__(self, p):
        """Store ``p``, the threshold a uniform draw from [0, 1] is compared to."""
        super().__init__()
        self.p = p

    def should_intercept(self, server_address):
        """Return True when a fresh uniform draw falls below ``self.p``."""
        draw = random.uniform(0, 1)
        return draw < self.p
class TlsFeedback(TlsLayer):
    """
    TlsLayer variant that reports the outcome of the client-side TLS setup
    to the module-level ``tls_strategy`` (setup may fail due to cert pinning).
    """

    def _establish_tls_with_client(self):
        """Run the parent TLS setup and record success or failure."""
        target = self.server_conn.address
        try:
            super(TlsFeedback, self)._establish_tls_with_client()
        except TlsProtocolException as exc:
            tls_strategy.record_failure(target)
            raise exc
        # Only reached when the parent call did not raise.
        tls_strategy.record_success(target)
# inline script hooks below.
# Strategy instance shared by the hooks; starts unset and is assigned in configure().
tls_strategy = None
def load(l):
    """Register the integer ``tlsstrat`` option (default 0) on loader ``l``."""
    option_name = "tlsstrat"
    default_value = 0
    help_text = "TLS passthrough strategy (0-100)"
    l.add_option(option_name, int, default_value, help_text)
def configure(updated):
    """
    (Re)build the module-level ``tls_strategy`` from the ``tlsstrat`` option.

    Args:
        updated: collection of option names that changed in this update.

    A positive ``tlsstrat`` selects ProbabilisticStrategy with probability
    ``tlsstrat / 100``; otherwise ConservativeStrategy is used.
    """
    global tls_strategy
    # A new strategy object starts with an empty history, so keep the current
    # one unless our own option changed (or nothing has been built yet).
    if tls_strategy is not None and "tlsstrat" not in updated:
        return
    percent = ctx.options.tlsstrat
    if percent > 0:
        tls_strategy = ProbabilisticStrategy(float(percent) / 100.0)
    else:
        tls_strategy = ConservativeStrategy()
def next_layer(next_layer):
    """
    Decide, for a client-facing TLS layer, between interception and
    pass-through; any other layer is left untouched.
    """
    client_tls_planned = isinstance(next_layer, TlsLayer) and next_layer._client_tls
    if not client_tls_planned:
        return

    server_address = next_layer.server_conn.address

    if not tls_strategy.should_intercept(server_address):
        # No interception: hand back a raw TCP layer and log a "skipped" entry.
        mitmproxy.ctx.log("TLS passthrough for %s" % repr(next_layer.server_conn.address), "info")
        passthrough_layer = RawTCPLayer(next_layer.ctx, ignore=True)
        next_layer.reply.send(passthrough_layer)
        tls_strategy.record_skipped(server_address)
        return

    # Interception: swap the layer's class so the strategy hears whether
    # the client-side TLS setup worked.
    next_layer.__class__ = TlsFeedback
from selenium import webdriver
from bs4 import BeautifulSoup
import time
# Build the Chrome options. selenium.webdriver exposes the Chrome options
# class as ChromeOptions; there is no webdriver.Options attribute.
option = webdriver.ChromeOptions()
option.add_argument('--no-sandbox')
option.add_argument('--disable-dev-shm-usage')
# Send all browser traffic through the local proxy listening on port 3215.
option.add_argument('--proxy-server=http://127.0.0.1:3215')
# 'chromedriver' on PATH is already the default driver location, so the
# executable_path keyword (removed in newer Selenium releases) is not passed.
driver = webdriver.Chrome(options=option)
url = 'https://hotel.tuniu.com/list/2500p0s0b0?checkindate=2021-05-31&checkoutdate=2021-06-01&cityName=%E4%B8%8A%E6%B5%B7&suggest=%7B%22code%22%3A2014281,%22name%22%3A%22%E5%B7%A8%E5%B3%B0%E8%B7%AF%E4%B8%8A%E6%B5%B7%E5%9C%B0%E9%93%816%E5%8F%B7%E7%BA%BF%20,%E4%B8%8A%E6%B5%B7,%E4%B8%AD%E5%9B%BD%22,%22type%22%3A13,%22catalog%22%3A2,%22cityCode%22%3A2500,%22longitude%22%3A121.588524,%22latitude%22%3A31.280576,%22chineseName%22%3A%22%E5%B7%A8%E5%B3%B0%E8%B7%AF%E4%B8%8A%E6%B5%B7%E5%9C%B0%E9%93%816%E5%8F%B7%E7%BA%BF%20%22,%22cityType%22%3A0,%22typeName%22%3A%22%E5%9C%B0%E9%93%81%22,%22cityName%22%3A%22%E4%B8%8A%E6%B5%B7%22,%22countryCode%22%3A40002,%22countryName%22%3A%22%E4%B8%AD%E5%9B%BD%22,%22hotelCount%22%3A391,%22score%22%3A62.15348,%22domestic%22%3Atrue,%22border%22%3Atrue%7D'
driver.get(url)
# Snapshot of the first results page, plus a parsed copy of it.
page_src = driver.page_source
bs = BeautifulSoup(page_src, "html.parser")
def click_next(driver, page_source, city, wait_seconds=20):
    """
    Click through the result pages until no "next" arrow is left, then
    quit the driver.

    Args:
        driver: Selenium WebDriver showing the first result page.
        page_source: HTML of the first page; paging is attempted only when
            it contains the ``arrowR"`` marker.
        city: label used in the progress messages.
        wait_seconds: pause after each click before the page is re-read.

    The driver is quit when paging ends, including when paging raises.
    """
    next_marker = 'arrowR"'
    # n tracks crawl progress (the current page number).
    n = 1
    try:
        print('正在爬取', city, '酒店信息', '第%d页' % n)
        # A city with a single page of hotels never enters the loop.
        if next_marker in page_source:
            # Re-read the live page each round: the arrow marker vanishing
            # is the signal that there is no further page.
            while next_marker in driver.page_source:
                # The ('class name', value) locator form avoids the
                # find_element_by_* helpers that newer Selenium dropped.
                driver.find_element('class name', 'arrowR').click()
                time.sleep(wait_seconds)
                # NOTE: pages are only advanced here; nothing is parsed.
                n += 1
                print('正在爬取', city, '酒店信息', '第%d页' % n)
        print(city, '爬取完成', '共%d页' % n)
    finally:
        driver.quit()
# Page through every result list for the city, starting from the snapshot
# of the first page taken above.
click_next(driver=driver, page_source=page_src, city='shanghai')
# print(driver.page_source)
# Pause before the final cleanup of the browser session.
time.sleep(100)
driver.quit()