python应用xpath爬做菜网并保存成json

2020-05-05 网页编程网 网页编程网
import requests
from lxml import etree
import json

base_url = "http://www.xiachufang.com"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36"
}

count = None
contents = None
item = {}
shicai_list = []

def get_link():
    link_list = []
    url = "http://www.xiachufang.com/category/957/?page=%s"
    for i in range(1,12):
        response = requests.get(url=url%i, headers=headers)
        html = etree.HTML(response.text)
        links = html.xpath('//div[@class="ing-recipe"]//div[@class="normal-recipe-list"]//p[@class="name"]/a/@href')
        for i in links:
            link = base_url + i
            link_list.append(link)
    return link_list

def get_info():
    link_list = get_link()
    for link in link_list:
        response2 = requests.get(url=link,headers=headers)
        html = etree.HTML(response2.text)
        title = html.xpath('//h1[@class="page-title"]/text()')[0].strip()
        yongliao = html.xpath('//table//tr')
        img_src = html.xpath('//img[@itemprop="image"]/@src')[0]
        img_response = requests.get(img_src,headers=headers)
        if "|" in title:
            title = title.replace("|","_")
        if "/" in title:
            title = title.replace("/", "_")
        print("正在下载：%s..."%title)
        with open("images/"+title+".jpg","wb") as f:
            f.write(img_response.content)
        for cailiao in yongliao:
            Materials_used = cailiao.xpath('./td[1]/a/text()')
            if not Materials_used:
                pass
            else:
                Materials_used = Materials_used[0]
                count = cailiao.xpath('./td[2]/text()')
                if not count:
                    count = "适量"
                else:
                    count = count[0].strip()
                    global contents
                    contents = cailiao.xpath('//li[@class="container"]/p/text()')
                    shicai_list.append(Materials_used + "    "+count)
        item["菜名"] = title
        item["食材用量"] = shicai_list
        item["步骤"] = contents
        f = open("caipu.json","a",encoding="utf-8")
        f.write(json.dumps(item,ensure_ascii=False,indent=4)+",\n")
        f.close()

if __name__ == '__main__':
    get_info()
阅读原文
阅读 3819
123 显示电脑版