The data to collect are the singer, the song title, and the link to each song.
The lyrics are saved separately into .txt files named by singer plus song title, under a dedicated folder.
'''Open a command window in the target folder and create the project'''
scrapy startproject singer
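The command generates the standard Scrapy skeleton; every file edited below lives inside it, roughly:

singer/
    scrapy.cfg
    singer/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py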
# items.py -- declare the fields to collect
import scrapy
from scrapy import Item, Field

class SingerItem(scrapy.Item):
    singer = Field()    # singer name
    songname = Field()  # song title
    href = Field()      # link to the song page
# -*- coding: utf-8 -*-
# Spider module under spiders/; written against the Selenium 3 locator API
from scrapy.spiders import CrawlSpider
from singer.items import SingerItem
from selenium import webdriver
import time
import os

class singer(CrawlSpider):
    name = "singer"
    start_urls = ['https://music.163.com/#/discover/toplist']

    driver = webdriver.Chrome()
    driver.get(start_urls[0])
    # myiframe = driver.find_element_by_id('g_iframe')
    driver.switch_to.frame('contentFrame')  # the chart is rendered inside an embedded iframe
    time.sleep(1)
    def parse(self, response):
        item = SingerItem()
        for i in range(100):
            # Locate the <tr> rows of the chart table
            trs = self.driver.find_element_by_id('song-list-pre-cache'). \
                find_elements_by_tag_name('table')[0]. \
                find_elements_by_tag_name('tbody')[0]. \
                find_elements_by_tag_name('tr')
            # Singer
            singer2 = trs[i].find_elements_by_tag_name('td')[3]. \
                find_elements_by_tag_name('div')[0].get_attribute('title')
            # Song title
            songname = trs[i].find_elements_by_tag_name('td')[1]. \
                find_element_by_class_name('ttc'). \
                find_element_by_tag_name('b').get_attribute('title')
            # Link, used later to open the lyrics page
            songimg = trs[i].find_elements_by_tag_name('td')[1]. \
                find_element_by_tag_name('a').get_attribute('href')
            print(singer2, songname, songimg)

            self.driver.get(songimg)  # follow the link to the song page
            myiframe = self.driver.find_element_by_id('g_iframe')
            self.driver.switch_to.frame(myiframe)  # enter the nested page
            time.sleep(1)
            # The "expand" button only exists when the song has lyrics; click it,
            # otherwise only part of the lyrics is returned
            if self.driver.find_elements_by_id('flag_ctrl'):
                self.driver.find_element_by_id('flag_ctrl').click()
                text = self.driver.find_element_by_id('lyric-content').text.strip()  # lyrics
                print(text)
                time.sleep(1)
            else:
                text = "No lyrics"  # the chart is dynamic, so some entries have no lyrics
                time.sleep(1)
                print(text)
            # Set the save path to suit your own setup
            base_dir = r"D:\pycharm2017.3.2\work\pachong\scrapy06-10\home\singer\singer"
            music_dir = os.path.join(base_dir, "music")
            if not os.path.exists(music_dir):
                os.mkdir(music_dir)
            # Windows file names cannot contain "/", so replace it
            safe_singer = singer2.replace("/", "+")
            f = open(os.path.join(music_dir, safe_singer + "," + songname + ".txt"),
                     'w', encoding='utf-8')
            f.write(text)
            f.close()
print("-----------------------------------")
self.driver.back() # 返回排行页面
print("44444444444444444444444444444444444444444444444")
####### 返回前页的时候,出来的不是初始的页面,需要重新获取嵌入式的网页内容
myiframe = self.driver.find_element_by_id('g_iframe')
self.driver.switch_to.frame(myiframe)
#进入返回后的嵌套式网页
item["href"] = songimg # 图片链接
item["singer"] = singer2 # 歌手
item["songname"] = songname # 歌名
yield item
time.sleep(1)
print("----------------------正在爬第"+str(i+1)+"页")
# pipelines.py -- instead of modifying the default SingerPipeline, a separate class is added for saving to Excel, written as follows
import xlwt

class ExcelSingerPipeline(object):
    index = 0  # current row in the worksheet

    def __init__(self):
        self.wb = xlwt.Workbook()
        self.sheet = self.wb.add_sheet("music")
        headers = ["link", "singer", "songname"]  # column titles, row 0
        for i, v in enumerate(headers):
            self.sheet.write(ExcelSingerPipeline.index, i, v)

    def process_item(self, item, spider):
        ExcelSingerPipeline.index += 1
        # Write the item's fields (href, singer, songname) into the next row
        for i, key in enumerate(item.keys()):
            self.sheet.write(ExcelSingerPipeline.index, i, item[key])
        return item

    def close_spider(self, spider):
        self.wb.save("music.xls")
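Note that the workbook is only flushed to disk in close_spider, so music.xls (written relative to the directory the crawl is launched from) appears once the spider finishes; the old .xls format also caps a sheet at 65,536 rows, which is plenty for a 100-song chart.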
'''settings.py: add a User-Agent header; you can keep several in a list and pick one at random (a sketch follows below).
The default export file configured in the settings file does not apply here:
FEED_URI = u'file:///D:/pycharm2017.3.2/work/scrapy 0608/doubanTU/douban.csv'
FEED_FORMAT = 'CSV'
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'
'''
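A minimal sketch of the "several User-Agents, pick one at random" idea, written as a downloader middleware; the class name and the second UA string are illustrative, not from the original project:

# middlewares.py (sketch) -- rotate User-Agent strings per request
import random

USER_AGENTS = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',  # illustrative
]

class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        # Selenium's own page loads bypass this, but Scrapy's downloads get a random UA
        request.headers['User-Agent'] = random.choice(USER_AGENTS)

To enable it, register the class in settings.py, e.g. DOWNLOADER_MIDDLEWARES = {'singer.middlewares.RandomUserAgentMiddleware': 400}.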
# Still in settings.py: find ITEM_PIPELINES, uncomment it, and point it at the custom class
ITEM_PIPELINES = {
    # 'singer.pipelines.SingerPipeline': 300,
    'singer.pipelines.ExcelSingerPipeline': 300,
}
#-*-coding:utf-8-*-
from scrapy import cmdline
cmdline.execute("scrapy crawl singer".split())
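Put this launcher script in the project root (next to scrapy.cfg) and run it from the IDE; it does the same thing as typing scrapy crawl singer on the command line.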
I keep updating this series, feel free to follow along~