从China Webmaster上收集房地产网站排名、网址和简介

    今天CX姐要求我从China Webmaster(chinaz.com)上收集排名前200名的房地产网站的排名、网址、简介以及评分信息。完成任务之后,我把程序重新优化了一下,写成了一个类,方便下次重复使用。最终的返回结果是一个 pandas 的 DataFrame,可以直接写成 Excel 文件。

class Rank:
    """Scrape the chinaz.com real-estate category ranking list.

    Collects site name, site URL, Alexa rank, score and an intro paragraph
    for the sites ranked between ``start`` and ``end`` (1-based, inclusive).
    The listing shows 30 sites per page.

    Usage::

        df = Rank(1, 200).scrapy()   # pandas DataFrame, one row per site
    """

    _PAGE_SIZE = 30  # sites per listing page on chinaz.com
    _BASE = 'http://top.chinaz.com'
    _LIST = 'http://top.chinaz.com/hangye/index_shenghuo_fangchang_alexa'

    def __init__(self, start, end):
        # First / last rank positions wanted (both inclusive, 1-based).
        # NOTE: imports formerly placed here were function-local and made
        # every other method fail with NameError; each method now imports
        # what it needs itself.
        self.begin = start
        self.limit = end

    # ---------------- page arithmetic (pure, no I/O) ----------------
    def _first_page(self):
        """Listing page (numbered from 1) that contains rank ``begin``."""
        return (self.begin - 1) // self._PAGE_SIZE + 1

    def _last_page(self):
        """Listing page that contains rank ``limit`` (ceiling division)."""
        return -(-self.limit // self._PAGE_SIZE)

    def _page_numbers(self):
        """All page numbers needed to cover ranks begin..limit."""
        return list(range(self._first_page(), self._last_page() + 1))

    def _page_url(self, num):
        """URL of listing page ``num``; page 1 carries no numeric suffix."""
        if num == 1:
            return self._LIST + '.html'
        return self._LIST + '_' + str(num) + '.html'

    # ---------------- per-cell extractors ----------------
    @staticmethod
    def _name(cell):
        """Site name from a 'CentTxt' listing cell."""
        return cell.find('a').get_text()

    @staticmethod
    def _site(cell):
        """Site address text from a 'CentTxt' listing cell."""
        return cell.find('span').get_text()

    @staticmethod
    def _rank(cell):
        """Alexa rank text from a 'RtCRateCent' listing cell."""
        return cell.find('strong').get_text()

    @staticmethod
    def _score(cell):
        """First run of digits in the score text of a 'RtCRateCent' cell.

        The original pattern ``\\d?\\d?\\d?\\d?`` matched the empty string
        whenever the text did not *start* with a digit; ``\\d+`` finds the
        digit run wherever it appears, falling back to '' if there is none.
        """
        import re
        match = re.search(r'\d+', cell.find('span').get_text())
        return match.group(0) if match else ''

    def _detail_url(self, cell):
        """Absolute URL of the site's chinaz detail page."""
        return self._BASE + cell.find('a').get('href')

    def _intro(self, url):
        """Fetch a site's detail page and return its 'webIntro' paragraph."""
        from urllib import request
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(request.urlopen(url).read(), 'lxml')
        return soup.find_all('p', attrs={'class': 'webIntro'})[0].get_text()

    def _scrape_page(self, num):
        """Scrape one listing page into a DataFrame (name/web/rank/score/web2)."""
        from urllib import request
        from bs4 import BeautifulSoup
        import pandas as pd
        soup = BeautifulSoup(request.urlopen(self._page_url(num)).read(), 'lxml')
        left = pd.Series(soup.find_all('div', attrs={'class': 'CentTxt'}))
        right = pd.Series(soup.find_all('div', attrs={'class': 'RtCRateCent'}))
        df = pd.DataFrame([], columns=['name', 'web', 'rank', 'score', 'web2'])
        df['name'] = left.apply(self._name)
        df['web'] = left.apply(self._site)
        df['rank'] = right.apply(self._rank)
        df['score'] = right.apply(self._score)
        df['web2'] = left.apply(self._detail_url)
        return df

    def _scrape_all(self):
        """Concatenate every needed listing page, trimmed to ranks begin..limit.

        Fixes the original off-by-one logic: ``begin // 30`` produced page 0
        for begin < 30 (whose scrape returned None and broke pd.concat), and
        the upper bound scraped one extra page when limit was a multiple of 30.
        """
        import pandas as pd
        pages = [self._scrape_page(num) for num in self._page_numbers()]
        df = pd.concat(pages, ignore_index=True)
        # Absolute rank of the first collected row, used to trim precisely.
        offset = self.begin - ((self._first_page() - 1) * self._PAGE_SIZE + 1)
        wanted = self.limit - self.begin + 1
        return df.iloc[offset:offset + wanted].reset_index(drop=True)

    def scrapy(self):
        """Return the final DataFrame, adding an 'intro' column per site.

        Note: fetches one detail page per row, so this performs network I/O
        proportional to (end - start + 1).
        """
        df = self._scrape_all()
        df['intro'] = df['web2'].apply(self._intro)
        return df

你可能感兴趣的:(从China Webmaster上收集房地产网站排名、网址和简介)