Python · 26 10 月, 2024 0

Python 使用 XPath 获取豆瓣电影排行榜数据

安装依赖库

python -m pip install requests
python -m pip install lxml

实现代码

# 豆瓣排行榜数据抓取
import re

import requests
from lxml import etree

from baidu_fanyi_test import check_resp

# Address of the Douban movie chart page fetched by this script.
req_url = "https://movie.douban.com/chart"

# HTTP headers passed to requests.get(); only a browser-like User-Agent is set.
headers = {
    "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0",
}


def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    The request is sent with the module-level ``headers``; the response is
    handed to ``check_resp`` before its text is returned.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error. Defaults to 10.

    Returns:
        The decoded body of the response (``Response.text``).
    """
    # requests applies no timeout by default, so a stalled connection would
    # block this call forever unless one is passed explicitly.
    r = requests.get(url, headers=headers, timeout=timeout)
    # NOTE(review): check_resp comes from baidu_fanyi_test; presumably it
    # validates the response status -- confirm against that module.
    check_resp(r)
    return r.text


def _first(nodes, default=""):
    """Return the first item of an XPath result list, or *default* if empty."""
    return nodes[0] if nodes else default


def get_chart(html):
    """Extract the movie entries from the chart page markup.

    Each ``<table>`` under ``div.indent`` is treated as one movie. Tables
    without a ``div.pl2`` info cell are skipped; individual fields that are
    missing from an entry fall back to a default instead of aborting the
    whole parse.

    Args:
        html: Markup of the chart page as a string.

    Returns:
        A list of dicts with the keys ``title``, ``detail``, ``score`` and
        ``people``. ``title`` and ``detail`` default to ``""`` when absent;
        ``score`` and ``people`` are the text of the second and third
        ``<span>`` of the star block, or ``None`` when those spans are
        absent. An empty *html* yields an empty list.
    """
    # Nothing to parse: return early rather than handing empty input to lxml.
    if not html:
        return []

    ele = etree.HTML(html)
    # NOTE(review): etree.HTML is expected to yield None for input with no
    # parseable markup -- guard so .xpath() is never called on None.
    if ele is None:
        return []

    tables = ele.xpath("//div[@class='indent']/div/table")
    movies = []
    for table in tables:
        movie_infos = table.xpath(".//tr/td[@valign='top']/div[@class='pl2']")
        # xpath() returns a list (never None), so test for emptiness.
        if not movie_infos:
            continue
        movie_info = movie_infos[0]

        # The title is the first link text joined with the first span text.
        main_title = _first(movie_info.xpath(".//a/text()"))
        alt_title = _first(movie_info.xpath(".//span/text()"))
        # Collapse the line breaks and indentation of the markup into spaces.
        title = re.sub(r'[\n\s]+', ' ', main_title + alt_title)

        detail = _first(movie_info.xpath(".//p[@class='pl']/text()"))

        # Score and vote count are read from the 2nd and 3rd span of the
        # star block; tolerate entries where they are not present.
        spans = movie_info.xpath(".//div[contains(@class, 'star')]/span")
        score = spans[1].text if len(spans) > 1 else None
        people = spans[2].text if len(spans) > 2 else None

        movies.append({
            "title": title,
            "detail": detail,
            "score": score,
            "people": people,
        })

    return movies


if __name__ == '__main__':
    # Fetch the chart page once, then print every parsed entry.
    page_source = get_html(req_url)
    for entry in get_chart(page_source):
        print(f"----------{entry['title']}------------")
        print(f"电影名称: {entry['title']}")
        print(f"电影内容: {entry['detail']}")
        print(f"电影评分: {entry['score']}/{entry['people']}")

 

以上就是获取豆瓣电影排行榜数据的代码。这里只实现了基本的抓取，没有爬取更多数据和电影详情，大家可以自行扩展。