主页 M

Python 正则爬取淘宝搜索数据

2020-05-11 网页编程网 网页编程网
import requests
import re

def getHTMLText(url):
    """Download *url* and return the response body as text.

    The request carries a fixed ``cookie`` header and a desktop-browser
    ``User-Agent`` and uses a 30-second timeout. No status check is done, so
    the body is returned whatever the HTTP status is; any exception raised by
    ``requests.get`` propagates to the caller.
    """
    cookie = "enc=rhkdBuATegC%2Bei%2FOyoznNhbQnMfVx%2Fmwc1WI%2BanFMOku5X39Cr7U2tOYm5ddcg5%2FEq9rpBgkEGwD%2FFh4RDNCRQ%3D%3D;"
    user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'

    response = requests.get(
        url,
        headers={"cookie": cookie, 'User-Agent': user_agent},
        timeout=30,
    )
    return response.text

def parsePage(ilt, html):
    """Extract ``[price, title]`` pairs from *html* and append them to *ilt*.

    Prices come from ``"view_price":"..."`` fields and titles from
    ``"raw_title":"..."`` fields. The two lists are paired by position; if
    their lengths differ, the unmatched tail is ignored.

    Args:
        ilt: List that receives one ``[price, title]`` entry per item. It is
            mutated in place; both elements are strings.
        html: Page text to search.

    Returns:
        None.
    """
    import json  # local import: only this function needs it

    # Capture groups pull out the values directly, so titles that contain a
    # colon are no longer truncated by a split on ':'.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    # Allow backslash escapes (including \") inside the title.
    titles = re.findall(r'"raw_title":"((?:[^"\\\n]|\\.)*)"', html)

    for price, raw_title in zip(prices, titles):
        # Decode escapes such as \u003d with the JSON parser instead of
        # eval(), which would execute whatever the remote page contains.
        try:
            title = json.loads('"' + raw_title + '"')
        except ValueError:
            # Not a valid JSON string body; keep the text as scraped.
            title = raw_title
        ilt.append([price, title])

def printGoodsList(ilt):
    """Print *ilt* as a tab-separated table with a header row.

    Each entry of *ilt* must be indexable with at least two elements:
    element 0 is shown in the price column and element 1 in the name column.
    Rows are numbered from 1 in the order they appear in *ilt*.
    """
    row_format = "{:4}\t{:8}\t{:16}"
    print(row_format.format("序号", "价格", "商品名称"))
    for index, item in enumerate(ilt, start=1):
        print(row_format.format(index, item[0], item[1]))

def main():
    """Scrape search results for a fixed keyword and print them by price.

    Issues ``depth`` requests; the ``s`` query parameter of the i-th request
    is ``44 * i``. Each page is passed to ``parsePage``, which appends
    ``[price, title]`` rows to a shared list. The rows are then sorted by
    ascending price and printed with ``printGoodsList``. A page that fails to
    download or parse is reported on stderr and skipped.
    """
    import sys  # local import: only needed for the skipped-page warning

    goods = '女生'
    depth = 20
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []

    def price_key(row):
        # The scraped price may be empty or otherwise not a valid float
        # (for example "1.2.3"); sort such rows last instead of crashing.
        try:
            return float(row[0])
        except (TypeError, ValueError):
            return float('inf')

    for i in range(depth):
        url = start_url + '&s=' + str(44 * i)
        try:
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception as exc:
            # Best effort: one bad page must not abort the whole run, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print('skipping page {}: {!r}'.format(i + 1, exc), file=sys.stderr)

    # One sort at the end is enough. The old per-page sort used int() as the
    # key, which raised ValueError for any decimal price.
    infoList.sort(key=price_key)
    printGoodsList(infoList)

# Script entry point: run the scraper when this module is executed.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so importing
# this module also triggers the network requests.
main()
阅读原文
阅读 3570
123 显示电脑版