前几天有一个妹子让我帮她写一个爬虫。问题背景是有几个产品,每个产品对应一个 Excel 文件。我要实现的功能是读取产品相关网页上该产品的净值,然后写到 Excel 中去。问题很简单,无非就是用正则表达式抓取网页内容,再读写 Excel。不过有一点需要注意:现有的几个 Python Excel 插件功能都不太完善,有的只能读 Excel 不能写;有的能读也能写,但是不能修改(修改可以理解为对已有文件进行追加操作)。不过有一个最强大但是略为繁琐的工具是 PyCom(即下面代码中用到的 win32com)。利用 COM 接口,几乎可以做到任何你想做的事,就是有点麻烦。其实也不麻烦啦,MSDN 有现成的文档,查查文档就能解决。我在这里直接贴代码了,因为内容没有什么可以讲的。
唯一值得讲的是,我给妹子写完代码,妹子就再也不理我了。。=-=
import re
import sys
import urllib.request
import io
import win32com.client
import os
# Replace sys.stdout with a text wrapper around the same underlying binary
# buffer that always encodes output as UTF-8.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') # change the default encoding of standard output
class NetWorth:
    """Scrape product net-worth figures from web pages and write them to Excel.

    Intended call order: ``getproductlist`` -> ``getproducturl`` ->
    ``getnetworth`` -> ``jxname`` -> ``findexcel``.

    All collected state lives on the instance, so separate ``NetWorth``
    objects never share results.
    """

    # Listing pages that are searched for links to the product detail pages.
    __pagelist = [
        'http://www.xinhu.cn/cfzx.html?action=list-more&page=1',
        'http://www.xinhu.cn/cfzx.html?action=list-more&page=2',
        'http://www.xinhu.cn/cfzx.html?action=list-more&page=3',
    ]

    # Text expected right after a product name in a listing page; group 2
    # captures the detail-page URL.  Raw string: the pattern contains ``\(``.
    __linktail = r'(" href=")(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)'

    # A number with exactly three decimals that is not followed by '%'.
    __worthpattern = r'(\d+\.\d{3})(?!%)'

    # First worksheet row that is examined when looking for a free cell.
    __startrow = 30

    def __init__(self):
        """Create an empty scraper with per-instance containers."""
        self.__product = []      # product names still waiting for a URL
        self.__simplename = []   # short names (also Excel file/sheet names)
        self.__map = {}          # product name -> detail page URL
        self.__worthmap = {}     # product name -> list of value strings

    def getproductlist(self, filename):
        """Load product names from a UTF-8 text file, one name per line.

        Only the trailing newline is removed from each name; lines that are
        empty or whitespace-only are skipped because an empty name would
        match every link on a page.
        """
        with open(filename, 'r', encoding='utf-8') as fo:
            names = [line.strip('\n') for line in fo]
        self.__product = [name for name in names if name.strip()]

    def jxname(self, filename):
        """Load short product names from a UTF-8 text file, one per line.

        Newlines and surrounding spaces are stripped; blank entries are
        skipped because an empty short name would match every product.
        """
        with open(filename, 'r', encoding='utf-8') as fo:
            names = [line.strip('\n').strip(' ') for line in fo]
        self.__simplename = [name for name in names if name]

    def getproducturl(self):
        """Find the detail-page URL of every loaded product.

        Each listing page is downloaded once.  A product whose link is found
        is recorded and removed from the pending product list, so later
        pages are searched only for the products that are still missing.
        """
        for pageurl in self.__pagelist:
            if not self.__product:
                break  # nothing left to look for, skip the download
            with urllib.request.urlopen(pageurl) as response:
                pagecon = response.read().decode('utf-8')
            pending = []
            for name in self.__product:
                # Escape the name: it is literal text, and an unescaped '('
                # would add a group and shift the URL away from group(2).
                found = re.search(re.escape(name) + self.__linktail, pagecon)
                if found:
                    self.__map[name] = found.group(2)
                else:
                    pending.append(name)
            self.__product = pending

    def removeirre(self, worthlist_):
        """Drop the entries at original positions 0, 6 and 7, in place.

        Args:
            worthlist_: list with at least eight items; it is modified.

        Returns:
            The same list object after the removals.

        Raises:
            IndexError: if the list has fewer than eight items.  The check
                happens before any removal, so the list is left untouched.
        """
        if len(worthlist_) < 8:
            raise IndexError(
                'removeirre needs at least 8 values, got %d' % len(worthlist_))
        # Delete the higher positions first so the lower index stays valid.
        del worthlist_[6:8]
        del worthlist_[0]
        return worthlist_

    def getnetworth(self):
        """Download every product page and collect its three-decimal numbers.

        Products whose page contains no such number are left out of the
        result.  Raises IndexError (from ``removeirre``) when a page yields
        some numbers but fewer than eight.
        """
        for name, url in self.__map.items():
            with urllib.request.urlopen(url) as response:
                pagecon = response.read().decode('utf-8')
            values = re.findall(self.__worthpattern, pagecon)
            if values:
                self.__worthmap[name] = self.removeirre(values)

    def writetoexcel(self, it, li):
        """Write the values of one product into column 2 of its workbook.

        The workbook is ``<current directory>/<it>.xlsx`` and the sheet is
        named ``it``.  The values are written top to bottom in the order
        ``li[4], li[4], li[4], li[3], li[2], li[1], li[0]`` followed by any
        further items of ``li`` reversed, starting at the first empty cell
        of column 2 at or below row 30.

        Args:
            it: short product name (file and sheet name).
            li: list of values with at least five items; not modified.

        Raises:
            IndexError: if ``li`` has fewer than five items.
        """
        filename = os.path.join(os.getcwd(), it + '.xlsx')
        # Work on a copy so the caller's list does not grow on every call.
        values = list(li) + [li[4], li[4]]
        values.reverse()

        xlapp = win32com.client.Dispatch("Excel.Application")
        xlWorkBook = xlapp.Workbooks.Open(filename)
        save = 0
        try:
            xlWorkSheet = xlWorkBook.WorkSheets(it)
            used = xlWorkSheet.UsedRange
            lastrow = used.Row + used.Rows.Count - 1
            # Default to the row after the used range so that a completely
            # filled column never yields the invalid row 0.
            row = max(self.__startrow, lastrow + 1)
            for k in range(self.__startrow, lastrow + 1):
                if xlWorkSheet.Cells(k, 2).value is None:
                    row = k
                    break
            for offset, value in enumerate(values):
                xlWorkSheet.Cells(row + offset, 2).value = value
            save = 1
        finally:
            # Always release the workbook; keep the changes only on success.
            xlWorkBook.Close(SaveChanges=save)

    def findexcel(self):
        """Write each short name's values to its Excel workbook.

        For every short name the first product whose full name contains it
        is used; short names that match no product are skipped.
        """
        for shortname in self.__simplename:
            if not shortname:
                continue
            for product, values in self.__worthmap.items():
                # Plain substring test: the short name is also a file and
                # sheet name, not a regular expression.
                if shortname in product:
                    self.writetoexcel(shortname, values)
                    break

    def printworthmap(self):
        """Print the number of scraped products and each product's values."""
        print('The len:\t', len(self.__worthmap))
        for name, values in self.__worthmap.items():
            print(name, values)

    def printproduct(self):
        """Print the product names that have no URL assigned yet."""
        for name in self.__product:
            print(name)

    def printjx(self):
        """Print the loaded short product names."""
        for name in self.__simplename:
            print(name)
def main():
    """Run the whole pipeline: scrape the net-worth values, then fill Excel.

    Reads product names from ``product.txt`` and short names from
    ``logogram.txt`` in the current working directory.
    """
    nty = NetWorth()
    nty.getproductlist('product.txt')
    nty.getproducturl()
    nty.getnetworth()
    nty.printworthmap()
    nty.jxname('logogram.txt')
    nty.printjx()
    nty.findexcel()


# Only run the pipeline when executed as a script, so that importing this
# module does not trigger network access or Excel automation.
if __name__ == '__main__':
    main()