python之爬取猫眼电影TOP100

# -*- coding:utf-8 -*-
import requests
import re
import json


def get_one_page(url):
    """Download ``url`` and return the response body as text.

    Returns None when the request fails or the server answers with any
    status code other than 200, so callers must check for None.
    """
    try:
        # Without a timeout requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        # Best-effort fetch: network-level failures are reported as None.
        return None
    if response.status_code == 200:
        return response.text
    return None

def write_to_file(conetxt):
    """Append ``conetxt`` to ``result.txt`` as one JSON document per line.

    Non-ASCII characters are written as-is rather than as ``\\uXXXX``
    escapes (``ensure_ascii=False``).
    """
    with open('result.txt', 'a') as out:
        serialized = json.dumps(conetxt, ensure_ascii=False)
        # The with-statement closes the file on exit; no explicit close needed.
        out.write(serialized + '\n')




# Captures, in order: rank, poster URL, title, cast text, release text,
# integer part of the score, fractional part of the score.
# NOTE(review): the HTML tag literals (<dd>, </i>, <a, </a>, </p>, </dd>) were
# missing from the published listing and are restored here from the surviving
# fragments -- confirm against the live page markup.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
    r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
    re.S)  # re.S lets ".*?" span the line breaks between tags


def parse_one_page(html):
    """Yield one dict per movie entry matched in ``html``.

    Each dict has the keys "index", "image", "title", "actor", "time" and
    "socre"; every value is UTF-8 encoded. Yields nothing when ``html`` is
    None or empty (for example after a failed download).
    """
    if not html:
        return
    for item in _MOVIE_PATTERN.findall(html):
        yield {
            "index": item[0].encode("utf-8"),
            "image": item[1].encode("utf-8"),
            "title": item[2].encode("utf-8"),
            # Drops the first 3 characters of the stripped cast text
            # (presumably a label prefix -- verify against the page).
            "actor": item[3].strip()[3:].encode("utf-8"),
            # Drops the first 5 characters of the stripped release text
            # (presumably a label prefix -- verify against the page).
            "time": item[4].strip()[5:].encode("utf-8"),
            # Score is split across two tags; join integer and fraction.
            "socre": item[5].encode("utf-8") + item[6].encode("utf-8"),
        }


def main(offset):
    """Fetch the board page at ``offset``, then print and save each entry."""
    url = "http://maoyan.com/board/4?offset=" + str(offset)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    # Ten pages, requested with offsets 0, 10, ..., 90.
    for i in range(10):
        main(i * 10)
 
  
以上代码示例使用 Python 2.7。注意编码问题:直接打印 list 或 dict 时,其中的中文只会以转义后的 ASCII 形式输出;写入文件时需要使用 json.dumps 并设置 ensure_ascii=False,才能将输出改为 UTF-8。
因此在 yield 的每个字段后追加了 encode("utf-8") 来指定编码格式。

你可能感兴趣的:(日记)