本人是无房一族,所以,比较关心房价,特别是最近房价又涨了。于是想自己分析一下存量、销量数据。没办法,自己爬下来看看吧。北京房地产交易管理网:http://www.bjfdc.gov.cn/
1.爬取链接
http://210.75.213.164/public/statistic/popup_everyday_all.asp?date_statistic=2006-01-01
日期可以是2005年某天到现在的任何一天
2.分析页面
没做很复杂的,就是找“可售期房”,“可售现房”,“期房签约”,“现房签约”中的“住宅套数”,其他的我不关心。
3.按照月、日存入Excel中
如果想自己分析,可以在Excel中画图来分析。
#-*- encoding:gb2312 -*-
"""Scrape daily Beijing housing statistics and store them in an Excel sheet.

For every day from the start date stored in the workbook up to (but not
including) today, the statistics page for that day is downloaded, four
"residential unit" counts are extracted and written into the worksheet.

Worksheet layout used by this script:

* cell (row 1, column 2) holds the next date to process as ``YYYY-MM-DD``;
* each calendar day (month/day, leap day included) has its own row,
  starting at row 3 for January 1st;
* each year owns four consecutive columns, starting at column 2 for 2006.

The source is kept ASCII-only (Chinese keys are written as escapes) so that
it loads the same way whatever encoding the file is saved in.
"""
import urllib
import re
import datetime
import time
import os
import threading
import sys

try:  # Python 3
    from urllib.request import urlopen
    from http.client import HTTPException
except ImportError:  # Python 2
    from urllib import urlopen
    from httplib import HTTPException

try:
    from win32com.client import constants, Dispatch
except ImportError:
    # pywin32 is missing: the parsing helpers stay usable, only EasyExcel
    # (which needs a COM connection to Excel) will refuse to start.
    constants = None
    Dispatch = None

urlPat = "http://210.75.213.164/public/statistic/popup_everyday_all.asp?date_statistic=%s"

# Encoding used to turn downloaded byte lines into text.
PAGE_ENCODING = 'gb2312'

# Label of the figure we want inside every section: "residential unit count".
KEY_RESIDENTIAL_UNITS = u'\u4f4f\u5b85\u5957\u6570'
# Section headers, in the order they are searched on the page.
SECTION_FORSALE_PRESALE = u'\u53ef\u552e\u671f\u623f\u7edf\u8ba1'      # pre-sale homes for sale
SECTION_SIGNED_PRESALE = u'\u671f\u623f\u7f51\u4e0a\u7b7e\u7ea6'       # pre-sale homes signed online
SECTION_FORSALE_COMPLETED = u'\u672a\u7b7e\u7ea6\u73b0\u623f\u7edf\u8ba1'  # completed homes not yet signed
SECTION_SIGNED_COMPLETED = u'\u73b0\u623f\u7f51\u4e0a\u7b7e\u7ea6'     # completed homes signed online

# A run of ASCII digits that fills a tag body, e.g. ">1234<".
_NUMBER_IN_TAG = re.compile(r'>([0-9]+)<')

# Errors that mean "the download failed, try again".
_RETRYABLE_ERRORS = (IOError, HTTPException)


def _toText(value):
    """Return *value* as text, decoding byte strings with PAGE_ENCODING."""
    if isinstance(value, bytes):
        # 'replace' keeps one odd byte from aborting a whole run.
        return value.decode(PAGE_ENCODING, 'replace')
    return value


def getLine(data, index):
    """Return line *index* of *data* as text.

    Raises IndexError when *index* is outside *data*.
    """
    return _toText(data[index])


def getNum(data, index):
    """Return the first ``>digits<`` number on line *index* as a string.

    Raises IndexError when the line does not exist or holds no such number.
    """
    match = _NUMBER_IN_TAG.search(getLine(data, index))
    if match is None:
        raise IndexError('no ">digits<" value on line %d' % index)
    return str(match.group(1))


def findLine(data, index, s):
    """Tell whether line *index* is a ``<td`` line that contains *s*."""
    line = getLine(data, index)
    # Membership tests also accept a match at column 0, which the former
    # "find(...) > 0" comparison silently rejected.
    return '<td' in line and _toText(s) in line


def _findFrom(data, start, key):
    """Return the index of the first ``<td`` line >= *start* holding *key*, or None."""
    for position in range(start, len(data)):
        if findLine(data, position, key):
            return position
    return None


def getData(data, begin, key1, key2):
    """Extract the number that follows label *key2* inside section *key1*.

    The search starts at line *begin*: first a ``<td`` line containing
    *key1*, then a later ``<td`` line containing *key2*; the number is read
    from the line right after that label.

    Returns ``[next_index, number]`` where *next_index* is the line after
    the number (to continue searching from) and *number* is a digit string.
    When anything is missing the result is ``[begin, 0]``.
    """
    notFound = [begin, 0]
    header = _findFrom(data, begin, key1)
    if header is None:
        return notFound
    label = _findFrom(data, header + 1, key2)
    if label is None:
        return notFound
    valueIndex = label + 1
    try:
        number = getNum(data, valueIndex)
    except IndexError:
        # The label is the last line, or the value line has no number.
        return notFound
    return [valueIndex + 1, number]


def parseStats(data):
    """Extract the four residential-unit counts from the page lines *data*.

    *data* may hold byte strings or text.  Returns the tuple
    ``(for-sale pre-sale, for-sale completed, signed pre-sale,
    signed completed)``; a count that cannot be found is ``0``.
    """
    lines = [_toText(line) for line in data]  # decode once, not per lookup
    key = KEY_RESIDENTIAL_UNITS
    # Sections are searched in page order, each search resuming where the
    # previous one stopped.
    index, ksqf = getData(lines, 0, SECTION_FORSALE_PRESALE, key)
    index, qyqf = getData(lines, index, SECTION_SIGNED_PRESALE, key)
    index, ksxf = getData(lines, index, SECTION_FORSALE_COMPLETED, key)
    index, qyxf = getData(lines, index, SECTION_SIGNED_COMPLETED, key)
    return (ksqf, ksxf, qyqf, qyxf)


def _fetchLines(url):
    """Download *url* and return its raw lines, always closing the response."""
    response = urlopen(url)
    try:
        return response.readlines()
    finally:
        response.close()


def down(date):
    """Download and parse the statistics page for *date*.

    Failed downloads are retried once per second until one succeeds.
    Returns the tuple produced by parseStats().
    """
    url = urlPat % date
    while True:
        try:
            data = _fetchLines(url)
            break
        except _RETRYABLE_ERRORS:
            # Only network/HTTP failures are retried, so Ctrl-C and real
            # programming errors are no longer swallowed.
            print('Time out')
            time.sleep(1)
    li = parseStats(data)
    print('%s %s' % (date, li))
    return li


class EasyExcel(object):
    """Thin wrapper around one worksheet of an Excel workbook (via COM)."""

    BASE_YEAR = 2006        # year stored in the first block of data columns
    COLUMNS_PER_YEAR = 4    # one column per extracted count
    FIRST_DATA_ROW = 3      # row used for January 1st
    FIRST_DATA_COLUMN = 2   # first column of BASE_YEAR
    _LEAP_YEAR = 2004       # reference year, so February 29th has a row

    def __getDateList(self):
        """Fill self.dateList with the 366 days of the reference leap year."""
        first = datetime.date(self._LEAP_YEAR, 1, 1)
        self.dateList.extend(
            first + datetime.timedelta(days=offset) for offset in range(366))

    def __row(self, date):
        """Return the worksheet row for the month/day of *date*."""
        anchor = datetime.date(self._LEAP_YEAR, date.month, date.day)
        return anchor.timetuple().tm_yday - 1 + self.FIRST_DATA_ROW

    def __column(self, date):
        """Return the first worksheet column for the year of *date*.

        Raises ValueError for years before BASE_YEAR, which would map to a
        column that does not exist.
        """
        if date.year < self.BASE_YEAR:
            raise ValueError('no column for year %d (sheet starts at %d)'
                             % (date.year, self.BASE_YEAR))
        return ((date.year - self.BASE_YEAR) * self.COLUMNS_PER_YEAR
                + self.FIRST_DATA_COLUMN)

    def __init__(self, filename, sheet):
        """Open workbook *filename* and select worksheet *sheet*.

        Raises ImportError when win32com (pywin32) is not installed.
        """
        if Dispatch is None:
            raise ImportError('win32com (pywin32) is required to drive Excel')
        self.xlApp = Dispatch('Excel.Application')
        self.filename = filename
        self.xlBook = self.xlApp.Workbooks.Open(filename)
        self.xlSheet = self.xlBook.Worksheets(sheet)
        self.dateList = list()
        self.__getDateList()

    def save(self):
        """Save the workbook."""
        self.xlBook.Save()

    def close(self):
        """Close the workbook, saving changes, and drop the Excel reference."""
        self.xlBook.Close(SaveChanges=1)
        del self.xlApp

    def getCell(self, row, col):
        """Return the value of the cell at (*row*, *col*)."""
        return self.xlSheet.Cells(row, col).Value

    def setCell(self, row, col, value):
        """Store *value* in the cell at (*row*, *col*)."""
        self.xlSheet.Cells(row, col).Value = value

    def setCells(self, date, li):
        """Write the first four items of *li* into the cells owned by *date*."""
        row = self.__row(date)
        col = self.__column(date)
        for offset in range(self.COLUMNS_PER_YEAR):
            self.setCell(row, col + offset, li[offset])


def getStartDate(excel):
    """Read the date to resume from out of cell (row 1, column 2).

    Returns a datetime.date.  Raises ValueError when the cell is empty or
    is not a ``YYYY-MM-DD`` value.
    """
    dateValue = excel.getCell(1, 2)
    print(dateValue)
    if dateValue is None:
        raise ValueError('start date cell (row 1, column 2) is empty')
    if hasattr(dateValue, 'year'):
        # NOTE(review): presumably Excel hands back a date-like object when
        # the cell is date-formatted -- confirm against the real workbook.
        return datetime.date(dateValue.year, dateValue.month, dateValue.day)
    # Ignore a trailing time part such as "2006-01-01 00:00:00".
    arr = dateValue.strip().split(' ')[0].split('-')
    if len(arr) != 3:
        raise ValueError('start date %r is not YYYY-MM-DD' % (dateValue,))
    return datetime.date(int(arr[0]), int(arr[1]), int(arr[2]))


def setStartDate(excel, date):
    """Store *date* as text in cell (row 1, column 2)."""
    excel.setCell(1, 2, str(date))


def download(filename='d:/test.xlsx', sheet='Data'):
    """Fill worksheet *sheet* of *filename* from its start date up to yesterday.

    Progress is saved after every day, so an interrupted run resumes from
    the last processed date.  The workbook is closed even when a day fails.
    """
    excel = EasyExcel(filename, sheet)
    try:
        date = getStartDate(excel)
        oneDay = datetime.timedelta(days=1)
        while date < datetime.date.today():
            li = down(date)
            excel.setCells(date, li)
            setStartDate(excel, date)
            excel.save()
            date = date + oneDay
    finally:
        excel.close()


if __name__ == '__main__':
    download()