Python: scraping multi-page HTML data and parsing it with re (Part 2)

#!/usr/bin/env python3
#-*- coding: utf-8 -*-

import os           # imported but not used below
import re
import threading    # imported but not used below
import urllib.parse
import urllib.request

'''
Example search URL:
url = "http://search.jd.com/Search?keyword=%E5%B9%BC%E7%8C%AB%E7%8C%AB%E7%B2%AE&enc=utf-8#filter"

Usage:
jd_search(keyword)

Each page's results are printed as a list of dicts:
[dict, dict, dict]
dict: {pic: '', title: '', price: '', url: ''}

Starting from page page_skip, crawl page_limit pages of data.
'''



def jd_search(keyword, page_skip=1, page_limit=1):
    # First URL-encode the Chinese keyword so it can be placed in the request URL
    keyword = urllib.parse.quote(keyword)
    # Checking how the site paginates shows that paging is driven by JS, but the
    # original URL can be rewritten into one that takes a page parameter.
    # Build the list of URLs to parse (each visible page advances the page parameter by 2).
    url_list = []
    i = 0
    while i < page_limit:
        page_no = page_skip + i * 2
        url = "http://search.jd.com/Search?keyword={}&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&stock=1&page={}&s=1&click=0".format(keyword, page_no)
        url_list.append(url)
        i += 1

    n = 1

    for url in url_list:
        print("Currently crawling URL #{}: {}".format(n, url))

        result = []
        content = urllib.request.urlopen(url).read()
        content_data = content.decode('utf-8')

        # NOTE: the literal HTML fragments inside the two regular expressions below were
        # stripped when this article was rendered; only the ".*?" pieces and capture
        # groups survive. See the filled-in sketch after this listing.
        pattern = re.compile(r'.*?', re.S)
        basic_content = re.finditer(pattern, content_data)

        for i in basic_content:
            init_dict = {}
            match_content = re.match(
                r'.*?source-data-lazy-img="(.*?)".*?'
                r'(.*?)(.*?).*?'
                r'title="(.*?)" href="(.*?)".*?',
                i.group(), re.S)
            init_dict['pic'] = match_content.group(1)
            init_dict['title'] = match_content.group(4)
            init_dict['price'] = match_content.group(2) + match_content.group(3)
            init_dict['url'] = 'http' + match_content.group(5)
            result.append(init_dict)

        print(result)
        n += 1
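
The two regular expressions above lost their literal HTML fragments when the article was rendered, so the patterns as shown are incomplete. Below is a rough sketch of what the parsing step might have looked like against JD's search-result markup of the time; the class names gl-item, p-img, p-price and p-name, the <em>/<i> price layout, and the protocol-relative href are assumptions and may need adjusting against the real page source.

import re

# Each product sits in an <li class="gl-item"> block (assumed markup).
ITEM_PATTERN = re.compile(r'<li class="gl-item".*?</li>', re.S)
# Image, price (currency sign + amount) and title/link, in document order (assumed markup).
DETAIL_PATTERN = re.compile(
    r'.*?<div class="p-img">.*?source-data-lazy-img="(.*?)"'
    r'.*?<div class="p-price">.*?<em>(.*?)</em><i>(.*?)</i>'
    r'.*?<div class="p-name.*?">.*?title="(.*?)" href="(.*?)"',
    re.S)

def parse_jd_items(content_data):
    """Return a list of {pic, title, price, url} dicts parsed from one result page."""
    items = []
    for block in ITEM_PATTERN.finditer(content_data):
        detail = DETAIL_PATTERN.match(block.group())
        if detail is None:
            # Skip blocks that do not match the assumed layout instead of crashing.
            continue
        items.append({
            'pic': detail.group(1),
            'title': detail.group(4),
            'price': detail.group(2) + detail.group(3),
            'url': 'http:' + detail.group(5),  # hrefs are assumed protocol-relative ("//item.jd.com/...")
        })
    return items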

Test:

jd_search('幼猫猫粮', page_skip=1, page_limit=10)
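
With these arguments the URL-building loop requests the page parameters 1, 3, 5, ..., 19, since JD's search pagination advances the page parameter by 2 for every visible page. A quick way to check which URLs would be fetched, reusing the same URL template as jd_search:

import urllib.parse

keyword = urllib.parse.quote('幼猫猫粮')
for i in range(10):
    page_no = 1 + i * 2
    print("http://search.jd.com/Search?keyword={}&enc=utf-8&qrst=1&rt=1&stop=1"
          "&vt=2&stock=1&page={}&s=1&click=0".format(keyword, page_no))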

This could also be implemented with the scrapy module.
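
As a minimal sketch (not a drop-in replacement), the same crawl could look roughly like the spider below; it assumes the same URL template and the same gl-item / p-img / p-price / p-name markup as the regex sketch above.

import urllib.parse

import scrapy

class JDSearchSpider(scrapy.Spider):
    name = 'jd_search'

    def start_requests(self):
        # Same keyword encoding and paging rule as jd_search above.
        keyword = urllib.parse.quote('幼猫猫粮')
        for i in range(10):
            page_no = 1 + i * 2
            url = ("http://search.jd.com/Search?keyword={}&enc=utf-8&qrst=1&rt=1"
                   "&stop=1&vt=2&stock=1&page={}&s=1&click=0").format(keyword, page_no)
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # CSS selectors based on the assumed markup; adjust against the real page source.
        for item in response.css('li.gl-item'):
            yield {
                'pic': item.css('div.p-img img::attr(source-data-lazy-img)').get(),
                'title': item.css('div.p-name a::attr(title)').get(),
                'price': item.css('div.p-price i::text').get(),
                'url': item.css('div.p-name a::attr(href)').get(),
            }

This can be run without creating a full Scrapy project, for example with: scrapy runspider jd_spider.py -o result.json (jd_spider.py being whatever file the spider is saved in).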
