安装依赖插件
python -m pip install requests
python -m pip install lxml
实现代码
# Scrape the Douban movie chart.
import re

import requests
from lxml import etree

from baidu_fanyi_test import check_resp

# Chart page that is fetched when the file is run as a script.
req_url = "https://movie.douban.com/chart"

# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0"
    )
}


def get_html(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The decoded response body (``Response.text``).
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    # NOTE(review): check_resp comes from baidu_fanyi_test; presumably it
    # validates the response status -- confirm against that module.
    check_resp(r)
    return r.text


def _first(nodes, default=""):
    """Return the first XPath result, or *default* when nothing matched."""
    return nodes[0] if nodes else default


def get_chart(html):
    """Extract the movie entries from the chart page markup.

    Args:
        html: HTML source of the chart page.

    Returns:
        A list of dicts with the keys ``title``, ``detail``, ``score`` and
        ``people``. Fields that are missing from an entry are returned as
        empty strings instead of aborting the whole scrape.
    """
    ele = etree.HTML(html)
    tables = ele.xpath("//div[@class='indent']/div/table")
    movies = []
    for table in tables:
        movie_infos = table.xpath(".//tr/td[@valign='top']/div[@class='pl2']")
        # xpath() yields an empty list (never None) when nothing matches.
        if not movie_infos:
            continue
        movie_info = movie_infos[0]

        # The title is the link text followed by the optional span text.
        title = (
            _first(movie_info.xpath(".//a/text()"))
            + _first(movie_info.xpath(".//span/text()"))
        )
        # Collapse runs of whitespace (including newlines) into one space.
        title = re.sub(r"\s+", " ", title).strip()

        detail = _first(movie_info.xpath(".//p[@class='pl']/text()"))

        # The code reads the score from the second span and the people
        # count from the third; entries with fewer spans get empty values.
        spans = movie_info.xpath(".//div[contains(@class, 'star')]/span")
        score = spans[1].text if len(spans) > 1 else ""
        people = spans[2].text if len(spans) > 2 else ""

        movies.append({
            "title": title,
            "detail": detail,
            "score": score,
            "people": people,
        })
    return movies


def main():
    """Download the chart page and print every movie found on it."""
    movies = get_chart(get_html(req_url))
    for movie in movies:
        print(f"----------{movie['title']}------------")
        print(f"电影名称: {movie['title']}")
        print(f"电影内容: {movie['detail']}")
        print(f"电影评分: {movie['score']}/{movie['people']}")


if __name__ == '__main__':
    main()
以上就是获取豆瓣电影排行数据的代码，这里只是实现了基本的获取，没有去爬取更多的数据和电影的详情，大家可以自行扩展。