自从过完年之后没有发过什么文章了,一直在写一个快捷酒店房间采集程序,现在已经做完了七天连锁的和如家的房间信息采集,现在会陆续将代码贴上来
我的python工程的目录如下图所示:
程序是使用python2.7+mongodb+threadpool完成的,所以你还需要一个mongodb数据库,并下载一个threadpool线程池库
一,我们首先在resources目录下放入两个xml文件:第一个文件为homeinns.xml,用于初始化如家酒店相关城市的信息;第二个文件为mongodb-config.xml,保存mongodb的连接配置(后文的InitConfig.py会读取它)
加入文件完毕之后,我们需要在init包中加入文件province_city.xml,内容如下(这可是哥手动打上去的啊):
接着再在init包中创建一个recordPC.py文件,代码如下:
# -*- coding:utf-8 -*-
from xml.dom import minidom
from com.jhnet.spider.db.factory.MongoFactory import MongoFactory
def recordPc():
    """Load every province/city pair from ``province_city.xml`` into MongoDB.

    The file is parsed relative to the current working directory.  For each
    city element one document with the keys ``province_name``,
    ``province_pinyin``, ``city_name`` and ``city_pinyin`` is inserted into
    the ``province_city`` collection.

    Requires ``MongoFactory`` to have been initialised beforehand.
    """
    xmldoc = minidom.parse("province_city.xml")
    element_type = minidom.Node.ELEMENT_NODE
    collection = MongoFactory.getDBCollection("province_city")

    # NOTE(review): firstChild is assumed to be the root element, i.e. the
    # file has no comment or doctype ahead of it -- confirm against the XML.
    for province in xmldoc.firstChild.childNodes:
        # Whitespace between tags shows up as text nodes; keep elements only.
        if province.nodeType != element_type:
            continue
        pname = province.attributes['name'].value
        ppinyin = province.attributes['province_pinyinname'].value

        # NOTE(review): index 1 assumes the province element starts with a
        # whitespace text node followed by the element wrapping its cities.
        cities = province.childNodes.item(1)
        for city in cities.childNodes:
            if city.nodeType != element_type:
                continue
            collection.insert({
                'province_name': pname,
                'province_pinyin': ppinyin,
                'city_name': city.attributes['name'].value,
                'city_pinyin': city.attributes['city_pinyinname'].value,
            })
由于这个文件是依赖于mongodb的,所以创建完成之后就先不管了。
二,在factory包中创建MongoFactory.py文件,代码如下(因为是数据连接,所以代码就不用加注释了):
# -*- coding: utf-8 -*-
import pymongo
class MongoFactory(object):
    """Holder for one shared pymongo connection and its default database.

    ``initMongo`` stores both handles on the class itself, so the static
    accessors work from anywhere once any instance has been initialised.
    """

    # Shared handles; both stay None until initMongo() has run.  They are
    # deliberately not double-underscore names: those would be name-mangled
    # and would not be the attributes the methods below read and write.
    conn = None
    db = None

    def __init__(self):
        pass

    def initMongo(self, config):
        """Open the connection described by ``config`` and select its database.

        ``config`` is a mapping with the keys ``mainMongo_host``,
        ``mainMongo_dbName``, ``mainMongo_connectionsPerHost``,
        ``mainMongo_connectTimeout`` and ``mainMongo_port``; the numeric
        entries may be strings and are converted with ``int``.
        """
        mainMongoHost = config['mainMongo_host']
        mainMongodbName = config['mainMongo_dbName']
        mainMongoConnectionsPerHost = int(config['mainMongo_connectionsPerHost'])
        mainMongoConnectTimeout = int(config['mainMongo_connectTimeout'])
        mainMongoPort = int(config['mainMongo_port'])
        # NOTE(review): positional order follows the pymongo 2.x
        # Connection(host, port, max_pool_size, network_timeout) signature --
        # confirm the timeout unit expected by the installed version.
        MongoFactory.conn = pymongo.Connection(
            mainMongoHost,
            mainMongoPort,
            mainMongoConnectionsPerHost,
            mainMongoConnectTimeout,
        )
        MongoFactory.db = MongoFactory.conn[mainMongodbName]

    @staticmethod
    def getConn():
        """Return the shared connection, or None before ``initMongo``."""
        return MongoFactory.conn

    @staticmethod
    def closeConn():
        """Close the shared connection; a no-op when none was opened."""
        if MongoFactory.conn is not None:
            MongoFactory.conn.close()

    @staticmethod
    def closeDB():
        """Forget the cached database handle.

        A pymongo Database has no ``close`` method (attribute access on it
        yields a collection of that name), so the handle is simply dropped;
        use ``closeConn`` to release the sockets.
        """
        MongoFactory.db = None

    @staticmethod
    def getDBCollection(tableName):
        """Return the collection named ``tableName`` from the shared database."""
        return MongoFactory.db[tableName]
三,接下来在entity包中创建一个实体ProvinceCity.py,代码如下:
# -*- coding: utf-8 -*-
class ProvinceCity(object):
    """One city together with the province it belongs to.

    Holds the display name and pinyin name of both the city and the
    province, plus an optional Homeinns city id that stays None until
    ``setHomeinnsCityid`` is called.
    """

    def __init__(self, cityName, cityPinyin, provinceName, provincePinyin):
        self.cityName = cityName
        self.cityPinyin = cityPinyin
        self.provinceName = provinceName
        self.provincePinyin = provincePinyin
        self.homeinnsCityid = None

    def __repr__(self):
        return "ProvinceCity(%r, %r, %r, %r)" % (
            self.cityName,
            self.cityPinyin,
            self.provinceName,
            self.provincePinyin,
        )

    # The accessor pairs below only read or write the attribute of the same
    # name; they are kept because existing callers use them.

    def setCityName(self, cityName):
        self.cityName = cityName

    def getCityName(self):
        return self.cityName

    def setCityPinyin(self, cityPinyin):
        self.cityPinyin = cityPinyin

    def getCityPinyin(self):
        return self.cityPinyin

    def setProvinceName(self, provinceName):
        self.provinceName = provinceName

    def getProvinceName(self):
        return self.provinceName

    def setProvincePinyin(self, provincePinyin):
        self.provincePinyin = provincePinyin

    def getProvincePinyin(self):
        return self.provincePinyin

    def setHomeinnsCityid(self, homeinnsCityid):
        self.homeinnsCityid = homeinnsCityid

    def getHomeinnsCityid(self):
        return self.homeinnsCityid
四,在util的__init__.py文件中放入如下代码:
# -*- coding:utf-8 -*-
from datetime import datetime
class Utils(object):
    """Date-string helpers plus a process-wide list used as a city cache."""

    # Shared on the class on purpose: callers append to it and read from it
    # as a global cache.
    cityCache = []

    _DATE_FORMAT = '%Y-%m-%d'

    @staticmethod
    def getLastDate(incr):
        """Return today's date shifted by ``incr`` months as 'YYYY-MM-DD'.

        The month is always shifted (not only when it rolls past December),
        any number of years may be crossed, and the day is clamped to the
        last day of the target month, so Jan 31 plus one month gives the
        last day of February instead of raising ValueError.
        """
        # Local import: the module header only imports datetime.
        import calendar

        today = datetime.now()
        # Work with a zero-based month so floor division carries the year
        # correctly for any incr, including negative values.
        months = today.month - 1 + incr
        year = today.year + months // 12
        month = months % 12 + 1
        day = min(today.day, calendar.monthrange(year, month)[1])
        return datetime(year, month, day).strftime(Utils._DATE_FORMAT)

    @staticmethod
    def getDate(incr):
        """Return ``(today, today + incr days)`` as 'YYYY-MM-DD' strings.

        Uses real calendar arithmetic, so month lengths, leap years and
        year boundaries are handled for any ``incr``.
        """
        # Local import: the module header only imports datetime.
        from datetime import timedelta

        today = datetime.now()
        later = today + timedelta(days=incr)
        return (today.strftime(Utils._DATE_FORMAT),
                later.strftime(Utils._DATE_FORMAT))

    @staticmethod
    def isLeap(year):
        """Return True when ``year`` is a Gregorian leap year, else False."""
        return year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
五,在init包中创建InitConfig.py文件,代码如下:
# -*- coding:utf-8 -*-
#--------------------------------------------
#描述:如家页面爬虫主程序
#日期: 2014-02-09
#作者: jiangfuqiang
#---------------------------
from com.jhnet.spider.db.factory.MongoFactory import MongoFactory
from com.jhnet.spider.entity.ProvinceCity import ProvinceCity
from com.jhnet.spider.fetchpage.spider import SpiderUtils
from com.jhnet.spider.util import Utils
from xml.dom import minidom
import os
#Flat mongodb settings, filled in by initMongoXml()
mongoConfigDict = dict()
#Replace this with a storage directory of your own
path = os.sep.join(["E:", "testdata", ""])
#Load the mongodb settings from XML into mongoConfigDict
def initMongoXml():
    """Flatten ``mongodb-config.xml`` into the module-level ``mongoConfigDict``.

    Every element two levels below the root becomes one entry whose key is
    ``<parent element name>_<element name>`` and whose value is the
    element's text.  An element with no content is stored as an empty
    string.

    The file path is relative to the current working directory.
    """
    xmldoc = minidom.parse("../../../resources/mongodb-config.xml")
    element_type = minidom.Node.ELEMENT_NODE

    # NOTE(review): firstChild is assumed to be the root element, i.e. the
    # file has no comment or doctype ahead of it -- confirm against the XML.
    for section in xmldoc.firstChild.childNodes:
        # Keep element nodes only; whitespace between tags is a text node.
        if section.nodeType != element_type:
            continue
        for setting in section.childNodes:
            if setting.nodeType != element_type:
                continue
            text = setting.firstChild
            value = text.nodeValue if text is not None else ''
            mongoConfigDict[section.nodeName + '_' + setting.nodeName] = value
initMongoXml() #Load the mongodb settings into mongoConfigDict
# Runs at import time: build a MongoFactory and initialise it from the
# settings loaded on the line above.
mf = MongoFactory()
mf.initMongo(mongoConfigDict)
#Cache the cities
def cacheCity():
    """Append one ProvinceCity per ``province_city`` document to Utils.cityCache.

    Each document must carry ``city_name``, ``city_pinyin``,
    ``province_name`` and ``province_pinyin``; ``homeinns_cityid`` is
    copied onto the entity only when the document has it.  Calling this
    more than once appends the same cities again.
    """
    # An empty cursor simply yields nothing, so no separate count() query
    # is needed before iterating.
    for city in MongoFactory.getDBCollection("province_city").find():
        pc = ProvinceCity(
            city['city_name'],
            city['city_pinyin'],
            city['province_name'],
            city['province_pinyin'],
        )
        if 'homeinns_cityid' in city:
            pc.setHomeinnsCityid(city['homeinns_cityid'])
        Utils.cityCache.append(pc)
cacheCity()
#At this point recordPC.py can be run; go and initialise the data first
好了,今晚就先上这么多,明晚继续