淘宝网页面解析

以下代码是我在个人网站项目中用来解析淘宝页面的。网站(fuckinstall.com)的主要功能是在后台将几个搜索引擎的内容整合在一起,并对结果进行相似度排序及聚类处理。顺带还做了个谷歌镜像的页面,前端是真心不太会……


#coding=utf8
# Module setup for the Taobao search-result parser (Python 2 code: it relies
# on the ``reload`` builtin and the ``HTMLParser`` module).
from ..common import crawlerTool as ct
from HTMLParser import HTMLParser  # its output comes back as unicode, which is awkward to handle afterwards
import sys
# Python 2 idiom: reloading ``sys`` makes ``setdefaultencoding`` callable again
# so that implicit str/unicode conversions use UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf-8')
import re
import traceback
import json
# Extract the required data


def _label_source(urlinfo):
	"""Append the marketplace suffix to ``title`` and set ``source`` based on the URL."""
	if 'tmall' in urlinfo['url']:
		urlinfo['title'] = urlinfo['title'] + '-天猫'
		urlinfo['source'] = 'tmall'
	else:
		urlinfo['title'] = urlinfo['title'] + '-淘宝'
		urlinfo['source'] = 'taobao'
	return urlinfo


def _auction_to_urlinfo(segment):
	"""Convert one entry of the ``itemlist`` auctions list into a result dict."""
	# NOTE(review): the URL is always built on detail.tmall.com, so every
	# auction ends up labelled as tmall. Kept as-is to preserve the existing
	# output; confirm whether taobao items should be told apart here.
	urlinfo = {
		'url': 'https://detail.tmall.com/item.htm?id=' + segment['nid'],
		'title': segment['raw_title'],
	}
	_label_source(urlinfo)
	num = segment.get('view_sales', '0')
	price = segment["view_price"]
	urlinfo['info'] = '价格%s元 购买数量%s' % (price, num)
	urlinfo['imglink'] = segment["pic_url"]
	return urlinfo


def _spu_to_urlinfo(segment):
	"""Convert one entry of the ``grid`` spus list into a result dict."""
	urlinfo = {'url': segment['url'], 'title': segment['title']}
	_label_source(urlinfo)
	importantKey = segment['importantKey']
	price = segment["price"]
	urlinfo['info'] = '价格%s%s ' % (price, importantKey)
	urlinfo['imglink'] = segment["pic_url"]
	return urlinfo


def process(keyword,page):
	"""Fetch one Taobao search result page and return its items.

	Args:
		keyword: search term, interpolated into the ``q`` query parameter.
			NOTE(review): it is inserted as-is, so callers presumably pass
			a URL-safe value -- confirm.
		page: 1-based page number; the ``s`` offset sent to the site is
			``(page - 1) * 44`` (presumably 44 items per page).

	Returns:
		A list of dicts with the keys ``url``, ``title``, ``source``,
		``info`` and ``imglink``. The list is empty when the embedded
		``g_page_config`` JSON cannot be found or parsed, or when it holds
		neither an auctions list nor a grid/spus list.
	"""
	url = 'https://s.taobao.com/search?q=%s&s=%s' % (keyword, (page - 1) * 44)
	# Use a separate name so the ``page`` argument is not shadowed.
	html = ct.crawlerTool.getPage(url)
	g_page_config = ct.crawlerTool.getRegex(r'g_page_config\s*=\s*(.*);', html)

	# Parse the embedded JSON once. TypeError covers a missing match (None),
	# ValueError covers malformed JSON, KeyError covers a missing 'mods'.
	try:
		mods = json.loads(g_page_config)['mods']
	except (TypeError, ValueError, KeyError):
		traceback.print_exc()
		return []

	# Preferred layout: the regular item list.
	convert = _auction_to_urlinfo
	try:
		segments = mods['itemlist']['data']['auctions']
	except (KeyError, TypeError):
		segments = []

	# Some keywords return a product grid instead of an item list; fall back
	# to it when the item list is missing or empty.
	if not segments:
		convert = _spu_to_urlinfo
		try:
			segments = mods['grid']['data']['spus'] or []
		except (KeyError, TypeError):
			traceback.print_exc()
			return []

	urlinsfos = []
	for segment in segments:
		# One malformed entry must not discard the rest of the page.
		try:
			urlinsfos.append(convert(segment))
		except Exception:
			traceback.print_exc()
	return urlinsfos



def test():
	"""Smoke test: fetch the first result page for the keyword ``python``."""
	# process() builds the search URL itself and requires a page number, so
	# pass the bare keyword and page 1 rather than a full URL.
	return process("python", 1)


你可能感兴趣的:(淘宝网页面解析)