Solving an Everyday Problem with a Web Crawler

    A few days ago a girl asked me to write a crawler for her. The background: there are a handful of products, and each product has its own Excel file. What I needed to do was read each product's net value from its page on the site and write it into the corresponding Excel file. The problem itself is simple, nothing more than regular expressions plus reading and writing Excel. One thing worth noting, though, is that none of the existing Python Excel libraries is complete: some can only read a workbook and not write it; others can read and write but cannot modify an existing file (i.e. you cannot just append to it). The most powerful, if slightly clunky, tool is PyCom, Python's COM support (win32com from pywin32). Through the COM interface you can do almost anything you want; it is just a bit tedious. Actually it is not that bad: MSDN has ready-made documentation, and a quick look there answers most questions. I will just give the code here, since there is not much else worth explaining.
    The only other thing worth mentioning is that after I finished the code, the girl never talked to me again. =-=
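
    Before the full script, here is a minimal sketch of what the COM route looks like, since modifying an existing workbook in place is exactly what the other libraries could not do. The workbook name test.xlsx, the sheet name Sheet1, and the value written are placeholders, not anything from the actual project:

import os
import win32com.client

# Attach to (or start) Excel through COM.
xlapp = win32com.client.Dispatch("Excel.Application")

# Open an existing workbook; test.xlsx is a made-up example file in the current directory.
workbook = xlapp.Workbooks.Open(os.path.join(os.getcwd(), 'test.xlsx'))
sheet = workbook.Worksheets('Sheet1')

# Write one value into column B of the first row below the used range
# (assuming the used range starts at row 1), then save and close.
nextrow = sheet.UsedRange.Rows.Count + 1
sheet.Cells(nextrow, 2).Value = 1.035
workbook.Close(SaveChanges=1)
xlapp.Quit()

    The full script below does the same kind of append, except it looks for the first empty cell in column B starting at row 30 of each product's sheet.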

import re
import sys
import urllib.request
import io
import win32com.client
import os

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')  # force UTF-8 on standard output

class NetWorth:
    # Only a single instance is created below, so plain class attributes are enough here.
    __product = []      # full product names still waiting for a detail-page URL
    __simplename = []   # abbreviated names, used as workbook/sheet names
    __map = {}          # full product name -> detail-page URL
    __worthmap = {}     # full product name -> list of extracted net values
    __pagelist = [
        'http://www.xinhu.cn/cfzx.html?action=list-more&page=1',
        'http://www.xinhu.cn/cfzx.html?action=list-more&page=2',
        'http://www.xinhu.cn/cfzx.html?action=list-more&page=3',
    ]

    def getproductlist(self, filename):
        # Read the full product names, one per line.
        with open(filename, 'r', encoding='utf-8') as fo:
            self.__product = [line.strip('\n') for line in fo]

    def jxname(self, filename):
        # Read the abbreviated names; each one doubles as the .xlsx file name and sheet name.
        with open(filename, 'r', encoding='utf-8') as fo:
            self.__simplename = [line.strip('\n').strip(' ') for line in fo]

    def getproducturl(self):
        # Download each listing page once, check every remaining product name against it,
        # and record the detail-page URL; products leave the queue once their URL is found.
        urlpart = r'(" href=")(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)'
        for each in self.__pagelist:
            pagecon = urllib.request.urlopen(each).read().decode('utf-8')
            index = 0
            while index < len(self.__product):
                searchobj = re.search(self.__product[index] + urlpart, pagecon)
                if searchobj:
                    self.__map[self.__product[index]] = searchobj.group(2)
                    del self.__product[index]
                else:
                    index += 1

    def removeirre(self, worthlist_):
        # Drop the matches that are not net values: the first hit and the two
        # entries that land at index 5 after the earlier deletions.
        del worthlist_[0]
        del worthlist_[5]
        del worthlist_[5]
        return worthlist_

    def getnetworth(self):
        # The pattern expects net values with three decimal places; the (?!%)
        # lookahead keeps percentage figures such as yields out of the matches.
        patternnetworth = r'(\d+\.\d{3})(?!%)'
        for it in self.__map:
            pagecon = urllib.request.urlopen(self.__map[it]).read().decode('utf-8')
            searchobj = re.findall(patternnetworth, pagecon)
            if searchobj:
                self.__worthmap[it] = self.removeirre(searchobj)

    def writetoexcel(self, it, li):
        # Both the workbook and its sheet are named after the abbreviated product name.
        filename = os.path.join(os.getcwd(), it + '.xlsx')
        # Duplicate the value at index 4 twice, then reverse the list before writing.
        li.append(li[4])
        li.append(li[4])
        li.reverse()
        xlapp = win32com.client.Dispatch("Excel.Application")
        xlWorkBook = xlapp.Workbooks.Open(filename)
        xlWorkSheet = xlWorkBook.Worksheets(it)
        # Find the first empty cell in column B, scanning from row 30 onwards;
        # if none is found, append right after the used range.
        flag = xlWorkSheet.UsedRange.Rows.Count + 1
        for k in range(30, xlWorkSheet.UsedRange.Rows.Count + 1):
            if xlWorkSheet.Cells(k, 2).Value is None:
                flag = k
                break

        # Write the values downward from that row, then save and close the workbook.
        for k in li:
            xlWorkSheet.Cells(flag, 2).Value = k
            flag += 1
        xlWorkBook.Close(SaveChanges=1)

    def findexcel(self):
        # Match each abbreviated name against the full product names and write the
        # corresponding net values into that product's workbook.
        for it in self.__simplename:
            for k in self.__worthmap:
                if re.search(it, k):
                    self.writetoexcel(it, self.__worthmap[k])
                    break


    def printworthmap(self):
        print('The len:\t',len(self.__worthmap))
        for it in self.__worthmap:
            print(it, self.__worthmap[it])

    def printproduct(self):
        for it in self.__product:
            print(it)

    def printjx(self):
        for it in self.__simplename:
            print(it)

nty = NetWorth()
nty.getproductlist('product.txt')
nty.getproducturl()
nty.getnetworth()
nty.printworthmap()

nty.jxname('logogram.txt')
nty.printjx()
nty.findexcel()
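
    A side note on the net-value pattern in getnetworth: the (?!%) lookahead is what keeps percentage figures such as yields out of the matches. A quick check against a made-up HTML fragment (not taken from the real pages):

import re

# Made-up fragment: one net value followed by an annualised yield.
sample = '<td>1.035</td><td>5.500%</td>'
print(re.findall(r'(\d+\.\d{3})(?!%)', sample))  # prints ['1.035']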
