python爬虫 获取小米应用商店app信息

接着上一篇文章,这次爬取小米app的数据。

python爬虫 获取小米应用商店app信息_第1张图片
主要爬取“应用”和“游戏”这两大类 app 的数据,完整代码如下:

import csv
import datetime
import re

import requests
from lxml import etree

# Base URL of the Xiaomi app store site. Not referenced by the code visible here.
url_2 = "http://app.mi.com/"


def fun(url, page1, pageId1):
    """Scrape one category-listing page and append one CSV row per app.

    Fetches ``url``, extracts package names from the response text, downloads
    each app's detail page and writes the extracted fields to the
    module-level file object ``f``, which must already be open for writing.
    Rows are emitted through ``csv.writer`` so that a comma or line break in
    a field is quoted instead of corrupting the row.

    Args:
        url: Listing URL to fetch.
        page1: Page number; used only in the progress output.
        pageId1: Category id; used only in the progress output.

    Returns:
        None. Apps whose detail page cannot be fetched or lacks an expected
        element are reported on stdout and skipped.
    """
    timeout = 10  # seconds; keeps one stalled connection from hanging the crawl

    try:
        listing = requests.get(url, timeout=timeout).text
    except requests.RequestException as exc:
        print("skip listing {}: request failed ({})".format(url, exc))
        return

    # Drop quotes, colons, commas and braces in a single pass so that each
    # package name sits directly between the "packageName" and "appId" tokens.
    stripped = listing.translate(str.maketrans('', '', '":,{}'))
    # NOTE(review): the pattern needs an "appId" token after every package
    # name, so a trailing entry that is not followed by one would be missed.
    # Confirm against the real API response.
    package_names = re.findall(r"packageName(.+?)appId", stripped)

    if not package_names:
        print(len(package_names))
        return

    # '\n' matches the line ending of the header row written by the caller.
    writer = csv.writer(f, lineterminator='\n')

    for package in package_names:
        detail_url = "http://app.mi.com/details?id=" + package
        try:
            detail_html = requests.get(detail_url, timeout=timeout).text
        except requests.RequestException as exc:
            print("skip {}: request failed ({})".format(package, exc))
            continue

        tree = etree.HTML(detail_html)
        try:
            name = tree.xpath('/html/body/div[4]/div[1]/div[2]/div[1]/div/h3/text()')[0]
            ntype = tree.xpath('/html/body/div[4]/div[1]/div[2]/div[1]/div/p[2]/text()[1]')[0]
            company = tree.xpath('/html/body/div[4]/div[1]/div[2]/div[1]/div/p[1]/text()')[0]
            company = company.replace(',', ' ')
            size = tree.xpath('/html/body/div[4]/div[1]/div[2]/div[2]/div/ul[1]/li[2]/text()')[0]
            size = size.replace(',', '')
            version = tree.xpath('/html/body/div[4]/div[1]/div[2]/div[2]/div/ul[1]/li[4]/text()')[0]
            version = version.replace(',', '')
            updateTime = tree.xpath('/html/body/div[4]/div[1]/div[2]/div[2]/div/ul[1]/li[6]/text()')[0]

            # The introduction may span several text nodes; join them, then
            # replace commas and drop carriage returns as before.
            intro_parts = tree.xpath('/html/body/div[4]/div[1]/div[4]/p/text()')
            intro = ''.join(intro_parts).strip().replace(',', ' ').replace('\r', '')

            score = tree.xpath('/html/body/div[4]/div[1]/div[2]/div[1]/div/span/text()')[0]
            picture = tree.xpath('/html/body/div[4]/div[1]/div[2]/div[1]/img/@src')[0]
            download = tree.xpath('/html/body/div[4]/div[1]/div[2]/div[1]/div/div[2]/a/@href')[0]
        except IndexError:
            # An expected element is absent from this detail page: skip the
            # app, but say so instead of dropping it silently.
            print("skip {}: detail page is missing an expected field".format(package))
        else:
            downloadaddr = 'http://app.mi.com' + download
            currTime = datetime.datetime.now()
            # The eighth column is written as a single space (no value is
            # scraped for it).
            writer.writerow([name, ntype, company, size, version,
                             updateTime, score, ' ', intro,
                             picture, downloadaddr, currTime])
        print("{}  {}".format(pageId1, page1))


# Crawl driver: opens the output CSV, writes the header row, then walks every
# listing page of each category. fun() writes its rows through the
# module-level name ``f`` bound by this ``with`` statement.
with open('D:/software_file/pythonFile/xiaomi_2.csv', 'w', encoding='gb18030') as f:
    columns = ['应用名称', '应用类型', '公司名称', 'app大小', '版本号',
               '更新时间', '评分人数', '下载人数',
               '应用介绍', '图标', '下载地址', '爬取时间']
    f.write(','.join(columns) + '\n')

    # Categories 1-15 plus category 27. Handling 27 in the same loop ensures
    # fun() receives the category id that is actually being crawled; the
    # previous separate loop passed the stale id left over from the first loop.
    for pageId in list(range(1, 16)) + [27]:
        for page in range(67):
            url2 = "http://app.mi.com/categotyAllListApi?page={}&categoryId={}&pageSize=30".format(page, pageId)
            fun(url2, page, pageId)

你可能感兴趣的:(python爬虫 获取小米应用商店app信息)