"""Scrape Taobao search-result pages with regular expressions.

Fetches several result pages for one search keyword, extracts each item's
price and title from the JSON embedded in the page source, and prints the
items sorted by ascending price.
"""
import json
import re
import sys

import requests

# The listing data is embedded in the page as JSON, so both fields are JSON
# string literals.  The capture group holds the text between the quotes.
_PRICE_RE = re.compile(r'"view_price":"([\d.]*)"')
# Accept backslash escapes so a title containing \" is not cut short.
_TITLE_RE = re.compile(r'"raw_title":"((?:[^"\\\n]|\\.)*)"')


def getHTMLText(url):
    """Download *url* and return the response body as text.

    Sends a fixed cookie and a desktop-browser User-Agent with the request.
    The HTTP status code is not checked, so an error page is returned as-is.

    Raises:
        requests.RequestException: on connection failure or when the
            30-second timeout expires.
    """
    headers = {
        # NOTE(review): hard-coded cookie value; presumably tied to a real
        # session -- consider loading it from the environment instead.
        "cookie": "enc=rhkdBuATegC%2Bei%2FOyoznNhbQnMfVx%2Fmwc1WI%2BanFMOku5X39Cr7U2tOYm5ddcg5%2FEq9rpBgkEGwD%2FFh4RDNCRQ%3D%3D;",
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    }
    r = requests.get(url, headers=headers, timeout=30)
    return r.text


def _decode_json_string(raw):
    """Decode the inside of a JSON string literal; return *raw* if invalid."""
    try:
        return json.loads('"' + raw + '"')
    except ValueError:
        return raw


def _price_key(item):
    """Sort key: the item's price as a float; unparseable prices sort last."""
    try:
        return float(item[0])
    except (TypeError, ValueError):
        return float("inf")


def parsePage(ilt, html):
    """Append ``[price, title]`` pairs found in *html* to the list *ilt*.

    Prices are kept as the strings that appear in the page; titles are
    JSON-decoded.  The page text is never evaluated as code.  The n-th price
    is paired with the n-th title; if the two counts differ, the surplus
    matches are ignored.

    Args:
        ilt: list that receives the pairs (modified in place).
        html: page source to scan.
    """
    prices = _PRICE_RE.findall(html)
    titles = _TITLE_RE.findall(html)
    # zip() stops at the shorter list, so a count mismatch cannot raise.
    for price, raw_title in zip(prices, titles):
        ilt.append([price, _decode_json_string(raw_title)])


def printGoodsList(ilt):
    """Print *ilt* as a tab-separated table with a 1-based row number.

    Args:
        ilt: iterable of ``[price, title]`` pairs.
    """
    tplt = "{:4}\t{:8}\t{:16}"
    print(tplt.format("序号", "价格", "商品名称"))
    for count, g in enumerate(ilt, start=1):
        print(tplt.format(count, g[0], g[1]))


def main(goods='女生', depth=20):
    """Fetch *depth* result pages for *goods* and print them sorted by price.

    A page whose download fails is reported on stderr and skipped, so one
    bad request does not abort the run.

    Args:
        goods: search keyword.
        depth: number of result pages to request.
    """
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []
    for i in range(depth):
        # The 's' query parameter advances in steps of 44 per page
        # (presumably the number of results per page -- TODO confirm).
        url = start_url + '&s=' + str(44 * i)
        try:
            html = getHTMLText(url)
        except requests.RequestException as exc:
            print("skipping page {}: {}".format(i + 1, exc), file=sys.stderr)
            continue
        parsePage(infoList, html)
    # Sort once after all pages are collected; prices are decimal strings,
    # so they are compared as floats.
    infoList.sort(key=_price_key)
    printGoodsList(infoList)


if __name__ == "__main__":
    main()
使用 Python 正则表达式爬取淘宝搜索数据
阅读:3566 输入:2020-05-11 17:50:36