# 接着上一篇文章，这次爬取小米 App 的数据。
import requests
from lxml import etree
import re
import datetime
# Root URL of the site being scraped (app.mi.com). Nothing else in the
# visible code reads this name; the request URLs below are spelled out in full.
url_2 = "http://app.mi.com/"
def fun(url, page1, pageId1):
    """Scrape one listing page and append one CSV line per app found on it.

    The text returned by ``url`` is searched for package names; for each
    one the matching ``http://app.mi.com/details?id=...`` page is fetched,
    its fields are extracted, and a comma-separated line is written to the
    module-level file object ``f``. ``f`` is not a parameter: it must
    already be open for writing when this function is called.

    Requests that fail or time out, and detail pages that lack one of the
    expected elements, are reported on stdout and skipped so that a single
    bad page does not stop the crawl.

    Args:
        url: Address of the listing page to fetch.
        page1: Page index; only used in the progress message.
        pageId1: Category id; only used in the progress message.

    Returns:
        None. If no package name is found, the count (0) is printed and
        nothing is written.
    """
    # requests.get() has no default timeout; bound every request so that one
    # stalled connection cannot hang the whole crawl.
    timeout = 30

    try:
        listing = requests.get(url, timeout=timeout).text
    except requests.RequestException as err:
        print("skip listing {}: {}".format(url, err))
        return

    # Strip quotes, colons, commas and braces so that every package name sits
    # directly between the literal words "packageName" and "appId".
    flattened = re.sub(r'[":,{}]', '', listing)
    package_ids = re.findall(r"packageName(.+?)appId", flattened)
    if not package_ids:
        print(len(package_ids))
        return

    for package_id in package_ids:
        detail_url = "http://app.mi.com/details?id=" + package_id
        try:
            html = requests.get(detail_url, timeout=timeout).text
            fields = _detail_fields(etree.HTML(html))
        except requests.RequestException as err:
            print("skip {}: {}".format(detail_url, err))
            continue
        except IndexError:
            # One of the single-valued XPath lookups matched nothing.
            print("skip {}: expected page element not found".format(detail_url))
            continue

        row = fields + (datetime.datetime.now(),)
        f.write('{},{},{},{},{},{},{},{},{},{},{},{}\n'.format(*row))

    print("{} {}".format(pageId1, page1))


def _detail_fields(tree):
    """Return the 11 leading CSV fields read from a parsed app detail page.

    Commas are removed from (or replaced by a space in) the free-text fields
    because the caller writes the line without any CSV quoting.

    Raises:
        IndexError: if any of the single-valued XPath lookups matches nothing.
    """
    def first(path):
        return tree.xpath(path)[0]

    name = first('/html/body/div[4]/div[1]/div[2]/div[1]/div/h3/text()')
    ntype = first('/html/body/div[4]/div[1]/div[2]/div[1]/div/p[2]/text()[1]')
    company = first('/html/body/div[4]/div[1]/div[2]/div[1]/div/p[1]/text()').replace(',', ' ')
    size = first('/html/body/div[4]/div[1]/div[2]/div[2]/div/ul[1]/li[2]/text()').replace(',', '')
    version = first('/html/body/div[4]/div[1]/div[2]/div[2]/div/ul[1]/li[4]/text()').replace(',', '')
    updateTime = first('/html/body/div[4]/div[1]/div[2]/div[2]/div/ul[1]/li[6]/text()')

    # The introduction may span several text nodes (or none at all).
    intro = ''.join(tree.xpath('/html/body/div[4]/div[1]/div[4]/p/text()')).strip()
    intro = intro.replace(',', ' ')
    # Remove CR and LF: an embedded line break would split the unquoted CSV
    # line into two rows.
    intro = intro.replace('\r', '').replace('\n', ' ')

    score = first('/html/body/div[4]/div[1]/div[2]/div[1]/div/span/text()')
    picture = first('/html/body/div[4]/div[1]/div[2]/div[1]/img/@src')
    download = first('/html/body/div[4]/div[1]/div[2]/div[1]/div/div[2]/a/@href')
    downloadaddr = 'http://app.mi.com' + download

    # The ' ' entry is a placeholder for the column that is never scraped.
    return (name, ntype, company, size, version, updateTime, score, ' ',
            intro, picture, downloadaddr)
# The output file is bound to the module-level name ``f`` because fun()
# writes its rows through that global; every fun() call must therefore
# happen inside this ``with`` block.
with open('D:/software_file/pythonFile/xiaomi_2.csv', 'w', encoding='gb18030') as f:
    # Header line; the column order matches the line written by fun().
    f.write(
        "{},{},{},{},{},{},{},{},{},{},{},{}\n".format('应用名称', '应用类型', '公司名称', 'app大小', '版本号',
                                                        '更新时间', '评分人数', '下载人数',
                                                        '应用介绍', '图标', '下载地址', '爬取时间'))
    # Categories 1-15 followed by category 27, each requested as 67 pages
    # (page index 0-66). The real category id is passed to fun() so that its
    # progress message names category 27 correctly instead of reusing the
    # last value of the previous loop.
    for pageId in list(range(1, 16)) + [27]:
        for page in range(67):
            url2 = "http://app.mi.com/categotyAllListApi?page={}&categoryId={}&pageSize=30".format(page, pageId)
            fun(url2, page, pageId)