最近想试试Python的爬虫库,就找了个只有字符串的网页来爬取。网址如下:
http://mobilecdn.kugou.com/api/v3/special/song?plat=0&page=1&pagesize=-1&version=7993&with_res_tag=1&specialid=26430
打开后看到是一些歌名还有hash等信息。按照hash|filename的方式存在文件里,先贴代码:
#coding=utf-8
import urllib
import re
import os
def getHtml(url):
    """Fetch *url* and return the raw, undecoded response body.

    The body is returned exactly as read from the response object: a byte
    string (``str`` on Python 2, ``bytes`` on Python 3).  No charset
    decoding is attempted here.

    Errors raised by the underlying ``urlopen`` call (unreachable host,
    malformed URL, ...) propagate to the caller unchanged.
    """
    # ``urllib.urlopen`` only exists on Python 2; on Python 3 the function
    # lives in ``urllib.request``.  Resolve it locally so the rest of the
    # file does not need to change its imports.
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    page = urlopen(url)
    try:
        return page.read()
    finally:
        # Python 2 response objects are not context managers, so close the
        # handle explicitly instead of using ``with``.
        page.close()
def getHash(html):
    """Extract every ``"hash"`` value from *html* and save them to ``1.txt``.

    *html* is the response payload, either a byte string or text; text is
    encoded as UTF-8 first so that all processing happens on bytes.

    Each value captured by the pattern ``"hash":"(.+?)",`` is written on its
    own line in the form ``<hash>|`` terminated by CRLF.  ``1.txt`` is
    created in the current working directory and overwritten if it already
    exists.  Nothing is returned.
    """
    if not isinstance(html, bytes):
        html = html.encode('utf-8')

    hashes = re.findall(br'"hash":"(.+?)",', html)

    # Binary mode: the line terminator is written explicitly as "\r\n", and a
    # text-mode handle would translate it again on Windows ("\r\r\n"), which
    # breaks the b'\r\n' replacement that getName performs on this file.
    with open('1.txt', 'wb') as f:
        f.writelines(item + b"|\r\n" for item in hashes)
def getName(html):
    """Append each ``"filename"`` value to the matching line of ``1.txt``.

    *html* is the response payload, either a byte string or text; text is
    encoded as UTF-8 first so that all processing happens on bytes.

    Values captured by the pattern ``"filename":"(.+?)",`` are paired, in
    order, with the lines of ``1.txt`` (the file getHash writes).  For each
    pair the name is inserted in front of the line's CRLF terminator and the
    result is written to ``2.txt``, producing ``<hash>|<filename>`` lines.

    Pairing stops at the shorter of the two sequences: surplus names or
    surplus lines are dropped.  ``1.txt`` must already exist; ``2.txt`` is
    overwritten.  Nothing is returned.
    """
    if not isinstance(html, bytes):
        html = html.encode('utf-8')

    names = re.findall(br'"filename":"(.+?)",', html)

    with open('1.txt', 'rb') as src:
        with open('2.txt', 'wb') as dst:
            # zip() consumes one line of 1.txt per extracted name.
            for name, line in zip(names, src):
                dst.write(line.replace(b'\r\n', name + b'\r\n'))
# Script body: download the playlist payload, write the hashes to 1.txt,
# merge the file names into 2.txt, then delete the intermediate 1.txt.
PLAYLIST_URL = "http://mobilecdn.kugou.com/api/v3/special/song?plat=0&page=1&pagesize=-1&version=7993&with_res_tag=1&specialid=26430"
html = getHtml(PLAYLIST_URL)
# Order matters: getName reads the 1.txt that getHash has just written.
for extract in (getHash, getName):
    extract(html)
os.remove('1.txt')