# coding=utf-8
__author__ = 'taven'
from blog.models import CarBrand, CarCategory, CarType
from pyquery import PyQuery as pyq
import chardet
import requests
import sys
reload(sys)
import urllib
sys.setdefaultencoding("utf-8")
# pyq plays the same role as jQuery's `$` function
# Root URL of the site being scraped; the crawl starts from this page.
base_url = r'http://db.auto.sohu.com/'
# Raw body of the landing page. Fetched at import time, because this
# statement runs at module top level.
res = requests.get(base_url).content
# Parsed landing page; calling it with a CSS selector returns the matches.
html = pyq(res)
# Every '.con .category_main' element; the loop further down treats each one
# as a single brand section.
car_category_list = html('.con .category_main')
# NOTE(review): never read or written again in the visible part of the file.
category_url_json = {}
# Check whether a character is blank or a special (control) character
def isblank(ch):
    """Return True when *ch* counts as a blank/control character.

    A character is blank when it is a space (U+0020), DEL (U+007F), a
    no-break space (U+00A0), or compares less than or equal to U+001F.
    The empty string also satisfies that last comparison.
    """
    return ch in (u'\x20', u'\x7f', u'\xA0') or ch <= u'\x1f'
def func_model(model_string):
    """Cut the trim/engine designation out of a car model title.

    The result is the part of *model_string* that starts right after the
    first blank character (as decided by ``isblank``) and stops just before
    the first character that is ``u'('`` or whose code point is U+4E00 or
    higher (U+4E00 is where the CJK Unified Ideographs block begins). For
    example ``u'2013款 40 TFSI 进取型'`` yields ``u'40 TFSI '``; the trailing
    space is kept.

    When no blank is found the slice starts at index 0, and when no
    terminating character is found it runs to the end of the string.

    :param model_string: model title as text or as an encoded byte string.
        Byte strings are decoded with the encoding guessed by ``chardet``.
        UTF-8 (with replacement characters) is the fallback when the guess
        is missing or wrong.
    :returns: the extracted text, or ``u''`` for empty/``None`` input.
    """
    if not model_string:
        return u''

    text_type = type(u'')
    if not isinstance(model_string, text_type):
        # chardet only understands bytes, and reports ``None`` when it cannot
        # make a guess. Handle both cases explicitly instead of relying on a
        # ``return`` inside ``finally`` to hide the resulting exception.
        guessed = chardet.detect(model_string)['encoding'] or 'utf-8'
        try:
            model_string = model_string.decode(guessed)
        except (UnicodeDecodeError, LookupError):
            model_string = model_string.decode('utf-8', 'replace')

    # Skip everything up to and including the first blank character.
    start_with = next(
        (idx + 1 for idx, ch in enumerate(model_string) if isblank(ch)), 0)
    remainder = model_string[start_with:]

    # Stop at the first '(' or character at/above U+4E00.
    end_with = next(
        (idx for idx, ch in enumerate(remainder)
         if ch >= u'\u4e00' or ch == u'('),
        len(remainder))
    return remainder[:end_with]
# Walk every brand section of the landing page: download the brand logo, then
# for each car series gather its main page plus the per-year pages that exist
# and print the de-duplicated model designations found on them.
# NOTE(review): the nesting below is reconstructed from the statement order;
# confirm it, in particular the placement of the final 'ok' print.
for index, car_category in enumerate(car_category_list):
    car_category_html = pyq(car_category)
    car_brand_html = car_category_html.find('.meta_left p')
    car_logo_html = car_category_html.find('.meta_left a img')
    # The logo address is read from the 'data-original' attribute and saved
    # as logo<index> in the local logo directory.
    urllib.urlretrieve(pyq(car_logo_html).attr('data-original'),
                       '/home/taven/tblog/car_logo/' + 'logo%s' % index)
    # Persistence of the brand is currently disabled:
    # car_brand = CarBrand(name=str(pyq(car_brand_html).text()).decode('utf8'),)
    # car_brand.save()
    print('\n')
    print('\n')
    print('车品牌:::::::::::::::' + pyq(car_brand_html).text())
    car_category_html_dt_list = car_category_html.find(
        'dt span:first-child span a')
    for car_category_html_dt in car_category_html_dt_list:
        series_link = pyq(car_category_html_dt)
        print("车系:::::::::::::::" + series_link.text())
        # NOTE(review): this value is not used anywhere in the visible file.
        car_category_item = series_link.parent().next().text()
        # Persistence of the series is currently disabled:
        # car_brand = CarBrand.objects.get(name=pyq(car_brand_html).text())
        # car_type = CarType(name=pyq(car_category_html_dt).text(),
        # car_brand_id=car_brand.id)
        # car_type.save()
        # print "车系href:::::::::::::::"+pyq(car_category_html_dt).attr('href')

        # Build the series URL once; the per-year URLs are derived from it.
        series_url = r'http://db.auto.sohu.com' + series_link.attr('href')
        category_url_list = [series_url]
        # $(".stop_sell h4 a").each(function(){alert(this)});
        for year in range(2001, 2016):
            year_url = series_url + 'year' + str(year) + '.shtml'
            # Only the final URL is needed, so close the handle right away
            # instead of leaking one connection per probed year.
            f = urllib.urlopen(year_url)
            try:
                final_url = f.geturl()
            finally:
                f.close()
            # A request that ends up on the site index is treated as
            # "no page for this year" (presumably a redirect -- confirm).
            if final_url != 'http://db.auto.sohu.com/index.shtml':
                category_url_list.append(year_url)

        car_category_name_list = []
        for category_url in category_url_list:
            print('\n')
            print(category_url)
            res_car_category = requests.get(category_url).content
            # Bodies of 10000 bytes or fewer are skipped (presumably stub or
            # error pages -- confirm the threshold).
            if res_car_category is not None and len(res_car_category) > 10000:
                # A separate name keeps the brand-level car_category_html
                # from being shadowed by the series/year page.
                year_page_html = pyq(res_car_category)
                car_category_a_list = year_page_html('.b .ftdleft')
                for car_category_a in car_category_a_list:
                    car_category_name = pyq(car_category_a).find("a:first").text()
                    handled_category_name = func_model(car_category_name)
                    if handled_category_name not in car_category_name_list:
                        car_category_name_list.append(handled_category_name)
                        print(handled_category_name)
                        # Persistence of the category is currently disabled:
                        # car_category = CarCategory(name=handled_category_name,
                        # car_type_id=car_type.id)
                        # car_category.save()
print('ok')
# Test data
if __name__ == '__main__':
    # Sample model titles, printed through func_model in this exact order
    # (one title appears twice on purpose, as in the original list).
    sample_titles = (
        '2011款 1.4T Urban版',
        '2013款 40 TFSI 进取型',
        '2011款 1.4T Ego plus版',  # this form cannot be handled
        '2002款 1.8i手动5速',  # this form cannot be handled
        '2009款 2.0TFSI 豪华型',
        '2012款 1.8TFSI MT舒适型',
        '2010款 2.0TFSI 标准型',
        '2014款 35 TFSI 进取型',
        '2012款 40TFSI 越野型',
        '2014款 35 TFSI 进取型',
        '2013款 40 TFSI quattro运动型',
        '2001款 1.8T手动5速基本型',
        '2003款 2.4i手动5速',
        '2003款 2.8i 无级手动一体技术领先型',
        '2005款 2.0T FSI®手动标准型',
        '2006款 L 07款 3.1 FSI 无级/手动一体',
        '2007款 2.8 FSI 尊享型',
        '2008款 2.4 技术型',
        '2008款 3.2FSI quattro 豪华型',
        '2010款 2.4L 豪华型',
        '2011款 2.7TDI 舒适型',
        '2012款 35 FSI quattro 豪华型(2.8L)',
        '2015款 40 TFSI quattro 豪华型',
    )
    for sample_title in sample_titles:
        print(func_model(sample_title))
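# Related models
# NOTE(review): `models` is not imported in this file and these class names
# shadow the ones imported from blog.models at the top -- this looks like a
# reference copy of the model definitions; confirm.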
class CarBrand(models.Model):
    """Car brand record, stored in the ``car_brand`` table."""

    name = models.CharField(max_length=30)  # brand name
    company_id = models.IntegerField(default=0)
    # presumably the id of the user who created the row -- confirm
    create_by = models.IntegerField(default=0)
    # auto_now_add stamps the row once on creation; auto_now would overwrite
    # the creation time on every save.
    create_date = models.DateTimeField(auto_now_add=True)
    # presumably the id of the user who last modified the row -- confirm
    update_by = models.IntegerField(default=0)
    # Refreshed on every save.
    update_date = models.DateTimeField(auto_now=True)
    # presumably a soft-delete marker -- confirm
    del_flag = models.BooleanField(default=False)

    class Meta:
        db_table = 'car_brand'
class CarType(models.Model):
    """Car series record belonging to one brand, stored in ``car_type``."""

    name = models.CharField(max_length=30)  # car series name
    company_id = models.IntegerField(default=0)
    # CASCADE is what older Django versions apply implicitly; spelling it out
    # keeps the behaviour and satisfies Django 2.0+, where it is mandatory.
    car_brand = models.ForeignKey(CarBrand, on_delete=models.CASCADE)
    # presumably the id of the user who created the row -- confirm
    create_by = models.IntegerField(default=0)
    # auto_now_add stamps the row once on creation; auto_now would overwrite
    # the creation time on every save.
    create_date = models.DateTimeField(auto_now_add=True)
    # presumably the id of the user who last modified the row -- confirm
    update_by = models.IntegerField(default=0)
    # Refreshed on every save.
    update_date = models.DateTimeField(auto_now=True)
    # presumably a soft-delete marker -- confirm
    del_flag = models.BooleanField(default=False)

    class Meta:
        db_table = 'car_type'
class CarCategory(models.Model):
    """Car category record belonging to one series, stored in ``car_category``."""

    name = models.CharField(max_length=30)  # car category name
    company_id = models.IntegerField(default=0)
    # CASCADE is what older Django versions apply implicitly; spelling it out
    # keeps the behaviour and satisfies Django 2.0+, where it is mandatory.
    car_type = models.ForeignKey(CarType, on_delete=models.CASCADE)
    # presumably the id of the user who created the row -- confirm
    create_by = models.IntegerField(default=0)
    # auto_now_add stamps the row once on creation; auto_now would overwrite
    # the creation time on every save.
    create_date = models.DateTimeField(auto_now_add=True)
    # presumably the id of the user who last modified the row -- confirm
    update_by = models.IntegerField(default=0)
    # Refreshed on every save.
    update_date = models.DateTimeField(auto_now=True)
    # presumably a soft-delete marker -- confirm
    del_flag = models.BooleanField(default=False)

    class Meta:
        db_table = 'car_category'