01爬虫实例-豆瓣top250

发布于 2022年 01月 21日 03:09

一、豆瓣top250

这是我参与更文挑战的第3天，活动详情查看：更文挑战

# Douban Top 250 scraper
# Collects each movie's title, review text and image URL
import requests
from lxml import etree


class DouBan(object):
    """Scraper for one page of the Douban Top 250 movie list.

    An instance is bound to a single page URL. ``run()`` downloads the page,
    saves the raw source to ``douban.html`` and prints the extracted data.
    """

    # Seconds to wait for the server. Without a timeout, requests.get() may
    # block indefinitely when the server stops responding.
    REQUEST_TIMEOUT = 10

    def __init__(self, url):
        """Remember the page URL and the headers sent with the request.

        Args:
            url: Address of the list page to download.
        """
        self.url = url
        # Browser-like User-Agent sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}

    def parse_url(self):
        """Download ``self.url`` and return the page source.

        Returns:
            The response body decoded with ``bytes.decode()``'s default
            encoding (UTF-8).

        Raises:
            requests.exceptions.Timeout: If the server does not answer
                within ``REQUEST_TIMEOUT`` seconds.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        response = requests.get(
            url=self.url,
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        return response.content.decode()

    def parse_str(self, sound_code):
        """Extract title, review and image URL for every movie on the page.

        Args:
            sound_code: Page source as returned by ``parse_url()``.

        Returns:
            A flat list in which each kept movie contributes three
            consecutive entries: the title matches, the review matches and
            the image ``src`` matches. Each entry is itself the list
            returned by ``xpath()``. Movies without a review are skipped.
        """
        html = etree.HTML(sound_code)
        li_list = html.xpath('//*[@id="content"]/div/div[1]/ol/li')
        item = []
        for li in li_list:
            title = li.xpath('div/div[2]/div[1]/a/span[1]/text()')
            film_review = li.xpath('div/div[2]/div[2]/p[2]/span/text()')
            # xpath() yields a list (empty when nothing matches), never
            # None, so the skip condition has to test for emptiness.
            if not film_review:
                continue
            img = li.xpath('div/div[1]/a/img/@src')
            item.extend((title, film_review, img))
        return item

    def save_html(self, content_list):
        """Write the page source to ``douban.html`` in the working directory.

        The file is opened in ``"w"`` mode, so each call replaces whatever
        an earlier call wrote.

        Args:
            content_list: Text to write. Despite the name, ``run()`` passes
                the page source string here.
        """
        with open("douban.html", "w", encoding="utf-8") as f:
            f.write(content_list)

    def run(self):
        """Download the page, save its source, then print the parsed data."""
        sound_code = self.parse_url()
        self.save_html(sound_code)
        content_list = self.parse_str(sound_code)
        print(content_list)


if __name__ == '__main__':
    # The list is paged by the ``start`` query parameter; this walks the
    # offsets 0, 25, ..., 225 and scrapes one page per offset.
    base_url = "https://movie.douban.com/top250?start="
    offset = 0
    while offset < 226:
        page_url = base_url + str(offset)
        DouBan(page_url).run()
        offset += 25

推荐文章