使用爬虫爬取猫眼电影 Top100 排名，并保存为本地 Excel 或 txt 文件

import requests,re
from openpyxl.workbook import Workbook
from openpyxl.writer.excel import ExcelWriter


def get_page(url, timeout=10):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``. Without it a stalled connection would block
            the scraper indefinitely.

    Returns:
        The response body decoded to ``str`` (``Response.text``).
    """
    # A browser-like User-Agent is sent with every request -- presumably the
    # site rejects the default client identifier (TODO confirm).
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; de-de) AppleWebKit/534.15+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4"
    }
    res = requests.get(url, headers=headers, timeout=timeout)
    return res.text


def parse_page(html):
    """Extract the movie entries from one board page.

    Args:
        html: HTML text of a single board page.

    Returns:
        A list of 6-tuples of strings, one per matched ``<dd>`` entry:
        (rank, title, cast, release time, integer part of the score,
        fraction part of the score). Empty when nothing matches.
    """
    # NOTE(review): the HTML tag literals of this pattern appear to have been
    # stripped from the published snippet. They are reconstructed here from
    # the six capture groups the rest of the script consumes -- confirm
    # against the live page markup before relying on it.
    pattern = re.compile(
        r"<dd>.*?board-index.*?>(.*?)</i>"
        r".*?name.*?><a.*?>(.*?)</a>"
        r".*?star.*?>\n(.*?)</p>"
        r".*?releasetime.*?>(.*?)</p>"
        r".*?integer.*?>(.*?)</i>"
        r".*?fraction.*?>(.*?)</i>"
        r".*?</dd>",
        re.S,  # let "." span the line breaks inside each <dd> entry
    )
    return pattern.findall(html)


def parse_all():
    """Fetch and parse the ten board pages (offsets 0, 10, ..., 90).

    Returns:
        The tuples produced by ``parse_page`` for every page, in page order.
    """
    top_100 = []
    for page in range(10):
        url = "https://maoyan.com/board/4?offset={0}".format(page * 10)
        top_100.extend(parse_page(get_page(url)))
    return top_100


def save_list():
    """Scrape the board and save it as an Excel workbook.

    Writes a header row followed by one row per movie to
    ``猫眼top100电影列表.xlsx`` in the current working directory.
    """
    top_100 = parse_all()
    wb = Workbook()
    ws = wb.worksheets[0]
    ws.title = "sheet1"

    column_titles = ("排名", "电影名", "主演", "上映时间", "评分")
    for column, title in enumerate(column_titles, start=1):
        ws.cell(row=1, column=column).value = title

    # Data rows start at row 2, directly below the header.
    for row, item in enumerate(top_100, start=2):
        rank, name, star, release_time, integer, fraction = item
        values = (
            rank,
            name,
            # The cast capture carries the surrounding indentation and line
            # break from the page; strip all of it, not only spaces.
            star.strip(),
            release_time,
            # The score is captured as separate integer and fraction parts.
            integer + fraction,
        )
        for column, value in enumerate(values, start=1):
            ws.cell(row=row, column=column).value = value

    wb.save("猫眼top100电影列表.xlsx")


def main():
    """Run the scraper and write the Excel file."""
    save_list()


if __name__ == "__main__":
    main()

爬虫爬取猫眼电影top100排名并保存本地Excel或txt_第1张图片

import requests,re
# Save the results in txt format


def get_page(url, timeout=10):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``. Without it a stalled connection would block
            the scraper indefinitely.

    Returns:
        The response body decoded to ``str`` (``Response.text``).
    """
    # A browser-like User-Agent is sent with every request -- presumably the
    # site rejects the default client identifier (TODO confirm).
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; de-de) AppleWebKit/534.15+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4"
    }
    res = requests.get(url, headers=headers, timeout=timeout)
    return res.text


def parse_page(html):
    """Extract the movie entries from one board page.

    Args:
        html: HTML text of a single board page.

    Returns:
        A list of 6-tuples of strings, one per matched ``<dd>`` entry:
        (rank, title, cast, release time, integer part of the score,
        fraction part of the score). Empty when nothing matches.
    """
    # NOTE(review): the HTML tag literals of this pattern appear to have been
    # stripped from the published snippet. They are reconstructed here from
    # the six capture groups the rest of the script consumes -- confirm
    # against the live page markup before relying on it.
    pattern = re.compile(
        r"<dd>.*?board-index.*?>(.*?)</i>"
        r".*?name.*?><a.*?>(.*?)</a>"
        r".*?star.*?>\n(.*?)</p>"
        r".*?releasetime.*?>(.*?)</p>"
        r".*?integer.*?>(.*?)</i>"
        r".*?fraction.*?>(.*?)</i>"
        r".*?</dd>",
        re.S,  # let "." span the line breaks inside each <dd> entry
    )
    return pattern.findall(html)


def parse_all():
    """Fetch and parse the ten board pages (offsets 0, 10, ..., 90).

    Returns:
        The tuples produced by ``parse_page`` for every page, in page order.
    """
    top_100 = []
    for page in range(10):
        url = "https://maoyan.com/board/4?offset={0}".format(page * 10)
        top_100.extend(parse_page(get_page(url)))
    return top_100


def save_list():
    """Scrape the board and append it to a UTF-8 text file.

    Each movie is written to ``猫眼电影Top100排行榜.txt`` as five lines
    (rank, title, cast, release time, score) followed by a line of fifty
    ``=`` characters. The file is opened in append mode, so repeated runs
    add to the existing content.
    """
    top_100 = parse_all()
    separator = "\n" + "=" * 50 + "\n"
    # Open the file once for the whole run; the context manager closes it
    # even if a write fails.
    with open("猫眼电影Top100排行榜.txt", "a", encoding="utf-8") as file:
        for rank, name, star, release_time, integer, fraction in top_100:
            record = [
                rank,
                name,
                star.strip(),  # drop the indentation captured around the cast
                release_time,
                integer + fraction,  # score is captured in two parts
            ]
            file.write("\n".join(record))
            file.write(separator)


def main():
    """Run the scraper and write the text file."""
    save_list()


if __name__ == "__main__":
    main()

你可能感兴趣的:(笔记)