主页 M

Python 使用 lxml 的 etree 爬取下厨房菜谱:面向对象写法,自定义起始页和终止页

2022-04-28 网页编程网 网页编程网
# -*- coding: utf-8 -*-
import requests
from lxml import etree
from fake_useragent import UserAgent
import time

class kitchen(object):
    """Scrape recipe entries from one xiachufang.com category listing.

    For every listing page in a user-supplied page range, each link matched
    on the page is fetched, a title and an ingredient list are extracted via
    XPath, and a formatted entry is appended to ``OUTPUT_FILE`` and echoed
    to stdout.
    """

    # Running count of recipes written so far; used to number the entries.
    u = 0

    # Listing URL with a page placeholder. The original URL had no
    # placeholder, so ``.format(page)`` returned the same URL for every page.
    URL_TEMPLATE = "https://www.xiachufang.com/category/40076/?page={}"
    # Prefix joined onto the site-relative hrefs found on a listing page.
    SITE_ROOT = "https://www.xiachufang.com"
    # Text file that receives the appended recipe entries.
    OUTPUT_FILE = '下厨房菜谱.txt'
    # Seconds to wait on a request before giving up instead of hanging.
    REQUEST_TIMEOUT = 10
    # Layout of one entry: running number, title, ingredients, recipe URL.
    ENTRY_TEMPLATE = (
        "  \n"
        "    第 %s 种\n"
        "    \n"
        "    菜 名 : %s\n"
        "    原 料 : %s\n"
        "    下 载 链 接 : %s,\n"
        "    ======\n"
        + " " * 24
    )

    def __init__(self):
        """Set the listing URL template and pick one random User-Agent."""
        self.url = self.URL_TEMPLATE
        # NOTE(review): ``verify_ssl`` is kept from the original call; newer
        # fake_useragent releases may not accept it - confirm against the
        # installed version.
        ua = UserAgent(verify_ssl=False)
        # One random User-Agent is enough: the original rebuilt this dict in
        # a loop and only the last assignment survived.
        self.headers = {
            'User-Agent': ua.random,
        }

    def get_page(self, url):
        """Send a GET request and return the response body as text.

        Args:
            url: Absolute URL to fetch.

        Returns:
            The response content decoded as UTF-8.

        Raises:
            requests.RequestException: On network failure or timeout.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        res = requests.get(
            url=url,
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        html = res.content.decode("utf-8")
        # Pause after every request to throttle the crawl.
        time.sleep(2)
        return html

    @classmethod
    def _format_entry(cls, index, title, ingredients, url):
        """Render one recipe entry using ``ENTRY_TEMPLATE``."""
        return cls.ENTRY_TEMPLATE % (str(index), title, ingredients, url)

    def parse_page(self, html):
        """Extract recipes linked from a listing page and append them to disk.

        Each matched link is fetched with ``get_page``. A recipe whose page
        cannot be fetched or parsed is reported and skipped so that one bad
        link does not stop the crawl.

        Args:
            html: HTML text of a category listing page.

        Raises:
            OSError: If the output file cannot be opened or written.
        """
        listing = etree.HTML(html)
        for href in listing.xpath('//li/div/a/@href'):
            url = self.SITE_ROOT + href
            try:
                detail = etree.HTML(self.get_page(url))  # second request
                title = detail.xpath('.//h2[@id="steps"]/text()')[0].strip()
                ingredients = detail.xpath('.//td//a/text()')
            except Exception:
                # Best effort per recipe. Unlike a bare ``except``, this
                # still lets KeyboardInterrupt/SystemExit stop the program.
                print('xpath没获取到内容!')
                continue

            self.u += 1
            food_info = self._format_entry(self.u, title, ingredients, url)
            # ``with`` guarantees the file is closed even if the write fails.
            with open(self.OUTPUT_FILE, 'a', encoding='utf-8') as f:
                f.write(food_info)
            print(food_info)

    def main(self):
        """Prompt for a start and end page, then crawl that inclusive range.

        Raises:
            ValueError: If either prompt receives a non-integer.
        """
        startPage = int(input("起始页:"))
        endPage = int(input("终止页:"))
        for page in range(startPage, endPage + 1):
            url = self.url.format(page)
            html = self.get_page(url)
            self.parse_page(html)
            time.sleep(2.4)
            print("=第 %s 页爬取成功=" % page)

if __name__ == '__main__':
    # Start the interactive crawl only when run as a script, not on import.
    spider = kitchen()
    spider.main()
阅读原文
阅读 2498
123 显示电脑版