最近和朋友一起开发APP,需要大量数据。在"互联网"与"共享"融合发展的理念下,资源的可重用性给予了当代骚客文人获得感与幸福感……好了,不日白了(正宗重庆话,吹牛的意思),开始正题。
本人是做JavaWeb的,可能多多少少还是遗留了Java的一些格式及规范,但爬虫千千万,却是Python最好使
作为爬虫(非用户操作),是一定要学会伪装自己,不然仅仅是写代码测试的时候,多搞几下,人家网站的反爬措施就会冻结你一段时间,那就直接玩鸟了。
# A fairly common minimal headers dict — a browser-like User-Agent alone is
# often enough to get past basic anti-crawler checks.
headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
}
# Or, if you feel like it, build a more complete set of headers, like this.
# NOTE(review): this second assignment replaces the dict above — in real code
# keep only one of the two.
headers = {
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.8',
'Cache-Control': 'max-age=0',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
'Connection': 'keep-alive',
'Referer': 'http://www.baidu.com/'
}
# Route requests through an HTTP proxy so the site sees the proxy's IP, not yours.
proxy = {'http':'106.46.136.112:808'}
import time
# time.sleep() takes SECONDS, so 3000 pauses for 50 minutes.
# NOTE(review): presumably meant as a long cool-down between crawl sessions —
# confirm the intended duration (3 seconds would be time.sleep(3)).
time.sleep(3000)
在此要感谢我的朋友TH,起初我只知道MongoDB这几个英文字母,是他作为引路人和实践者,让我认知到又一门技术,非常感谢你的引导!
# -*- coding:utf-8 -*-
import json
import re
import time
import random
import requests
import pymongo
import bson.binary
from selenium import webdriver
from bs4 import BeautifulSoup
# Base URL of the music site being crawled (domain anonymised in this post).
BASE_URL = "https://music.xxx.com"
# Browser-like header for plain `requests` calls (see GetData.get_request);
# Selenium traffic sets its own User-Agent via ChromeOptions instead.
HEADER = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
class GetData:
    """Crawl songs (lyrics + cover images) from the music site into MongoDB.

    Workflow: list hot "欧美" playlists -> walk each playlist's songs -> for
    each song store its cover image in the `picture` collection and its merged
    (original + translated) lyrics in the `lyric` collection.
    """

    def __init__(self):
        # MongoClient is lazy: no network I/O happens until the first operation.
        client = pymongo.MongoClient('192.168.31.68', 27017)
        db = client["song"]
        # Target collections: lyric text and cover-image binaries.
        self.lyric = db["lyric"]
        self.picture = db["picture"]

    @staticmethod
    def get_driver():
        """Return a Chrome WebDriver whose User-Agent mimics a desktop browser."""
        options = webdriver.ChromeOptions()
        # Spoofed UA so the site serves the normal page to the crawler.
        options.add_argument('user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"')
        # Local chromedriver binary.
        chrome_driver = r"D:\myProject\worker\venv\Lib\site-packages\selenium\webdriver\chrome\chromedriver.exe"
        # `options=` replaces the deprecated `chrome_options=` keyword
        # (supported since selenium 3.x; `chrome_options` warns/was removed).
        return webdriver.Chrome(executable_path=chrome_driver, options=options)

    @staticmethod
    def get_chrome(driver, url):
        """Load *url*, switch into the page's first iframe, and return its soup.

        The site renders its real content inside an iframe, so we must switch
        frames before reading `page_source`.
        """
        # Random 10-30 s pause to stay under the site's anti-crawl radar.
        time.sleep(random.randint(10, 30))
        driver.get(url)
        iframe = driver.find_elements_by_tag_name('iframe')[0]
        driver.switch_to.frame(iframe)
        return BeautifulSoup(driver.page_source, "lxml")

    @staticmethod
    def get_request(url):
        """GET *url* with browser-like headers and return the Response."""
        time.sleep(random.randint(10, 30))  # random anti-throttling pause
        # timeout added so a dead connection cannot hang the crawl forever.
        return requests.get(url, headers=HEADER, timeout=30)

    @staticmethod
    def _merge_lyrics(lrc, cn_lrc):
        """Interleave original and translated lyric lines (en, cn, en, cn, ...).

        Timestamp/metadata tags such as "[00:12.34]" are stripped first.
        Lines beyond the shorter of the two lyric sets are dropped, matching
        the original pairing behaviour.
        """
        # Non-greedy so a line like "[00:12]Hello [x]" keeps its visible text;
        # the previous greedy ".*" ate everything up to the line's last "]".
        tag = re.compile(r'\[.*?\]')
        en_lines = tag.sub("", lrc).strip().split("\n")
        cn_lines = tag.sub("", cn_lrc).strip().split("\n")
        # zip() pairs line i with line i in O(n); the old nested index loops
        # performed the same pairing in O(n^2).
        return "".join(en + "\n" + cn + "\n" for en, cn in zip(en_lines, cn_lines))

    @staticmethod
    def get_lrc(song_id):
        """Fetch and return the merged English + Chinese lyrics for *song_id*.

        Raises KeyError when the API response lacks 'lrc'/'tlyric'; the
        per-song try/except in get_netease_cloud absorbs that.
        """
        lrc_url = 'http://music.xxx.com/api/song/lyric?' + 'id=' + str(song_id) + '&lv=1&kv=1&tv=-1'
        payload = json.loads(GetData.get_request(lrc_url).text)
        return GetData._merge_lyrics(payload['lrc']['lyric'], payload['tlyric']['lyric'])

    def get_netease_cloud(self, count):
        """Crawl page *count* of the hot "欧美" playlists and persist every song."""
        driver = GetData.get_driver()
        try:
            # 35 playlists per page; `count` selects the page offset.
            url = BASE_URL + "/#/discover/playlist/?order=hot&cat=欧美&limit=35&offset=" + str((0 + count) * 35)
            soup = GetData.get_chrome(driver, url)
            # <a class="msk"> carries each playlist's id/path.
            msg = soup.find_all('a', class_=re.compile('msk'))
            for playlist in msg:
                # Song table of one playlist.
                soup = GetData.get_chrome(driver, BASE_URL + playlist['href'])
                # Each play button's data-res-id is the song id.
                play = soup.find_all('span', class_=re.compile('ply'))
                for play_id in play:
                    try:
                        url = BASE_URL + "/song?id=" + play_id['data-res-id']
                        soup = GetData.get_chrome(driver, url)
                        # Cover image bytes.
                        song_img = soup.find_all('img', class_=re.compile('j-img'))
                        song_img_data = GetData.get_request(song_img[0]['data-src']).content
                        # Song title.
                        song_name = soup.find_all('em', class_=re.compile('f-ff2'))
                        # Artist line and album line.
                        singer, album = soup.find_all('p', class_=re.compile('des s-fc4'))
                        # Strip tags, keep only the text between ">" and "<".
                        pat = re.compile('>(.*?)<')
                        pat_song_name = ''.join(pat.findall(str(song_name[0])))
                        pat_singer = ''.join(pat.findall(str(singer)))
                        pat_album = ''.join(pat.findall(str(album)))
                        result = "歌名:" + pat_song_name + "\n" + pat_singer + "\n" + pat_album + "\n" + GetData.get_lrc(
                            play_id['data-res-id'])
                        # insert_one replaces the deprecated Collection.save()
                        # (identical here: the documents carry no _id).
                        self.picture.insert_one(dict(
                            song_name=pat_song_name,
                            img=bson.binary.Binary(song_img_data)
                        ))
                        self.lyric.insert_one(dict(
                            song_name=pat_song_name,
                            singer=pat_singer,
                            album=pat_album,
                            lyric=result
                        ))
                    except Exception as e:
                        # Best-effort: skip songs whose page/lyrics are malformed.
                        print(e)
        finally:
            # Always release the browser, even if the crawl blows up mid-page.
            driver.quit()
if __name__ == '__main__':
    # Crawl 38 result pages (35 playlists each) of the hot "欧美" category.
    scraper = GetData()
    for page in range(38):
        scraper.get_netease_cloud(page)