爬虫的代码在 Free-Midi-Library/src/midi_scratch.py
在搜索MIDI资源的过程中,我浏览了很多网站,其中 Free Midi Files Download 这个网站从资源数量以及资源组织形式这两个方面来看都是最优秀的一个,包含的音乐风格有17种之多,这之中摇滚乐的MIDI文件数目达到了9866个,通过结构化的爬取操作,这些文件的元数据(风格、歌手、歌名)都十分完整地保存在了MongoDB数据库中,方便之后的训练和测试。
爬虫的过程分为以下三个阶段:
def free_midi_get_genres():
    """Scrape the performer list of every genre page into the genre collection.

    For each genre name yielded by ``get_genres()`` that is not stored yet,
    the page ``https://freemidi.org/genre-<genre>`` is fetched and the link
    and text of every ``div.genre-link-text`` element are collected. One
    document per genre is inserted.

    The document uses the field names that ``free_midi_get_performers``
    reads (``Name``, ``Performers``, ``PerformersUrls``, ``PerformersNum``)
    and is created with ``Finished: False`` so that stage picks it up.
    """
    genres_collection = get_genre_collection()
    for genre in get_genres():
        # Skip genres that were already stored by a previous run.
        if genres_collection.count_documents({'Name': genre}) != 0:
            continue

        url = 'https://freemidi.org/genre-' + genre
        text = get_html_text(url)
        if text == '':
            # Other callers in this file treat an empty string as a failed
            # request; do not store an empty genre that would never be retried.
            print('connection error')
            continue
        soup = BeautifulSoup(text, 'html.parser')

        urls = []
        performers = []
        for item in soup.find_all(name='div', attrs={'class': 'genre-link-text'}):
            try:
                href = item.a['href']
            except (TypeError, KeyError):
                # No <a> child (item.a is None) or the anchor has no href.
                continue
            urls.append(href)
            performers.append(item.text)

        genres_collection.insert_one({
            'Name': genre,
            'PerformersNum': len(urls),
            'Performers': performers,
            'PerformersUrls': urls,
            'Finished': False,
        })
        print(genre, len(urls))
def free_midi_get_performers():
    """Expand every unfinished genre document into one document per performer.

    Reads ``Name``, ``Performers`` and ``PerformersUrls`` from each genre
    document whose ``Finished`` flag is ``False``, stores one performer
    document (``Name``, ``Url``, ``Genre``, ``Finished: False``) per entry,
    then marks the genre as finished and prints the overall progress.
    """
    root_url = 'https://freemidi.org/'
    genres_collection = get_genre_collection()
    performers_collection = get_performer_collection()

    for genre in genres_collection.find({'Finished': False}):
        genre_name = genre['Name']
        # The two lists are parallel: entry i of each describes one performer.
        for name, relative_url in zip(genre['Performers'], genre['PerformersUrls']):
            url = root_url + relative_url
            print(name, url)
            # A genre interrupted half-way stays unfinished; skip performers
            # already stored so a re-run does not create duplicates.
            if performers_collection.count_documents({'Url': url, 'Genre': genre_name}) != 0:
                continue
            performers_collection.insert_one({
                'Name': name,
                'Url': url,
                'Genre': genre_name,
                'Finished': False,
            })

        genres_collection.update_one(
            {'_id': genre['_id']},
            {'$set': {'Finished': True}})
        finished = genres_collection.count_documents({'Finished': True})
        total = genres_collection.count_documents({})
        print('Progress: {:.2%}\n'.format(finished / total))
def get_free_midi_songs_and_add_performers_info():
    """Scrape the track list of every unfinished performer into the MIDI collection.

    Loops until no performer document has ``Finished: False``. For each such
    performer the page at ``performer['Url']`` is fetched, every
    ``div[itemprop=tracks]`` element is turned into a MIDI document
    (``Name``, ``DownloadPage``, ``Performer``, ``PerformerUrl``, ``Genre``,
    ``Downloaded: False``) unless one with the same genre and name exists,
    and the performer is marked finished with ``Num`` set to the number of
    tracks found on the page.

    A performer whose page yields no tracks is left unfinished and is
    retried on the next pass of the outer loop.
    """
    root_url = 'https://freemidi.org/'
    midi_collection = get_midi_collection()
    performer_collection = get_performer_collection()

    while performer_collection.count_documents({'Finished': False}) != 0:
        for performer in performer_collection.find({'Finished': False}):
            performer_url = performer['Url']
            performer_name = performer['Name']
            genre = performer['Genre']
            try:
                # Browser-like request header fields, handed to get_html_text
                # as its second argument.
                params = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
                    'Cookie': cookie_str,
                    'Referer': root_url + genre,
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                    'Connection': 'keep-alive'
                }
                text = get_html_text(performer_url, params)
                if text == '':
                    print('connection error')
                    continue
                soup = BeautifulSoup(text, 'html.parser')

                num = 0
                for item in soup.find_all(name='div', attrs={'itemprop': 'tracks'}):
                    try:
                        download_url = root_url + item.span.a['href']
                        # Normalise BEFORE the duplicate check so the query
                        # matches the value that is actually stored.
                        name = item.span.text.replace('\n', '')
                    except (AttributeError, TypeError, KeyError):
                        # Element without the expected <span><a href=...>.
                        continue
                    if midi_collection.count_documents({'Genre': genre, 'Name': name}) == 0:
                        midi_collection.insert_one({
                            'Name': name,
                            'DownloadPage': download_url,
                            'Performer': performer_name,
                            'PerformerUrl': performer_url,
                            'Genre': genre,
                            'Downloaded': False
                        })
                    # Count every listed track, stored now or earlier, so a
                    # re-run over already stored tracks still finishes.
                    num += 1

                if num != 0:
                    performer_collection.update_one(
                        {'_id': performer['_id']},
                        {'$set': {'Finished': True, 'Num': num}}
                    )
                    # Random pause between performers to throttle requests.
                    time.sleep(uniform(1, 1.6))
                    print('Performer ' + performer_name + ' finished.')
                    finished = performer_collection.count_documents({'Finished': True})
                    total = performer_collection.count_documents({})
                    print('Progress: {:.2%}\n'.format(finished / total))
            except Exception:
                # Deliberately broad: keep crawling the remaining performers,
                # but let KeyboardInterrupt/SystemExit stop the loop.
                print('Error connecting.')
def download_free_midi():
    """Download the MIDI file of every document not yet marked ``Downloaded``.

    Loops until no MIDI document has ``Downloaded: False``. For each one:

    1. fetch the song page (``DownloadPage``) with the performer page as
       referer and look up the ``<a id="downloadmidi">`` link;
    2. fetch that link with the song page as referer;
    3. save the body to ``<root_path>/<genre>/<name> - <performer>.mid``
       (characters illegal in file names removed);
    4. if ``is_valid_midi`` accepts the file, set ``Downloaded: True`` and
       ``GetterLink`` on the document; otherwise delete the file so the
       document is retried on the next pass.

    Failures for one document are printed and do not stop the loop.
    """
    root_url = 'https://freemidi.org/'
    root_path = 'E:/free_MIDI'
    params = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Connection': 'keep-alive'
    }
    # '/ \ : * ? " < > |' plus CR, LF and TAB are stripped from file names.
    illegal_chars = re.compile(r'[\\/:*?"<>|\r\n\t]+')

    midi_collection = get_midi_collection()
    session = requests.Session()
    # Every request below uses verify=False; silence the resulting warnings.
    requests.packages.urllib3.disable_warnings()
    session.headers.update(params)
    # `cookies` is not defined in this function; it is expected to exist at
    # module level.
    session.cookies = cookies

    while midi_collection.count_documents({'Downloaded': False}) != 0:
        # A cursor opened with no_cursor_timeout must be closed explicitly;
        # the with-block does that even if the loop is interrupted.
        with midi_collection.find({'Downloaded': False}, no_cursor_timeout=True) as cursor:
            for midi in cursor:
                performer_link = midi['PerformerUrl']
                download_link = midi['DownloadPage']
                name = midi['Name']
                genre = midi['Genre']
                performer = midi['Performer']
                try:
                    # Step 1: the song page, which carries the real download link.
                    session.headers.update({'Referer': performer_link})
                    with session.get(download_link, verify=False, timeout=20) as page:
                        if page.cookies.get_dict():
                            print(page.cookies.get_dict())
                            # Merge instead of replacing the jar, so cookies
                            # obtained earlier are kept.
                            session.cookies.update(page.cookies)
                        if page.status_code != 200:
                            print('connection error ' + str(page.status_code))
                            continue
                        soup = BeautifulSoup(page.text, 'html.parser')

                    anchor = soup.find(name='a', attrs={'id': 'downloadmidi'})
                    if anchor is None or not anchor.get('href'):
                        print('Found no download link')
                        continue
                    getter_link = root_url + anchor['href']
                    print(getter_link)

                    # All other header fields are already set on the session
                    # with the same values; only the referer changes.
                    session.headers.update({'Referer': download_link})

                    target_dir = os.path.join(root_path, genre)
                    os.makedirs(target_dir, exist_ok=True)
                    name = illegal_chars.sub('', name).strip()
                    performer = illegal_chars.sub('', performer).strip()
                    file_name = name + ' - ' + performer + '.mid'
                    path = os.path.join(target_dir, file_name)

                    # Step 2: fetch the file completely before touching the
                    # disk, so a failed request leaves no empty file behind.
                    with session.get(getter_link, allow_redirects=True, verify=False, timeout=20) as resp:
                        if resp.history:
                            print('Request was redirected')
                            for hop in resp.history:
                                print(hop.url)
                            print('Final: ' + str(resp.url))
                        resp.raise_for_status()
                        if resp.cookies.get_dict():
                            print(resp.cookies)
                            session.cookies.update(resp.cookies)
                        content = resp.content

                    with open(path, 'wb') as output:
                        output.write(content)
                    # Random pause between downloads to throttle requests.
                    time.sleep(uniform(2, 3))

                    # The file is closed at this point, so the validator sees
                    # the complete content.
                    if is_valid_midi(path):
                        print(file_name + ' downloaded')
                        midi_collection.update_one(
                            {'_id': midi['_id']},
                            {'$set': {'Downloaded': True, 'GetterLink': getter_link}}
                        )
                        done = midi_collection.count_documents({'Downloaded': True})
                        total = midi_collection.count_documents({})
                        print('Progress: {:.2%}\n'.format(done / total))
                    else:
                        print('Cannot successfully download midi.')
                        os.remove(path)
                except Exception:
                    # Deliberately broad: report and move on to the next file,
                    # but let KeyboardInterrupt/SystemExit stop the crawl.
                    print(traceback.format_exc())
爬取到的MIDI文件夹结构如下,其中每个子文件夹代表不同的风格:
每个子文件夹内就包含该风格的所有MIDI文件:
为了方便处理,我用md5算法对所有文件名做了哈希处理,并将对应的哈希码保存在数据表中,可以通过简单的find语句来查找,哈希化代码保存在 Free-Midi-Library/src/md5_reorganize.py
为了使得训练效果更佳,我将所有的MIDI音乐的速度调整到120bpm,并转调到C调,这两种操作的代码在 src/unify_tempo.py 和 src/transpose_tone.py 可以找到。
def transpose_to_c():
    """Transpose every MIDI file not yet marked ``Transposed`` so its tonic is C.

    For each such document the file ``<root_dir>/<Genre>/<md5>.mid`` is
    parsed with music21 to estimate its key. The semitone distance from the
    estimated tonic to C is then added to the pitch of every note of every
    non-drum instrument (via pretty_midi), and the result is written to
    ``<transpose_root_dir>/<Genre>/<md5>.mid``. The mode is not altered.

    A note whose shifted pitch would fall outside the MIDI range 0..127
    keeps its original pitch. Errors for one file are printed and the file
    stays unmarked.
    """
    root_dir = 'E:/free_midi_library/'
    transpose_root_dir = 'E:/transposed_midi/'
    midi_collection = get_midi_collection()
    # Target tonic; identical for every file, so build it once.
    c_tone = key.Key('C', 'major').tonic

    # A cursor opened with no_cursor_timeout must be closed explicitly.
    with midi_collection.find({'Transposed': False}, no_cursor_timeout=True) as cursor:
        for midi in cursor:
            original_path = os.path.join(root_dir, midi['Genre'] + '/', midi['md5'] + '.mid')
            os.makedirs(os.path.join(transpose_root_dir, midi['Genre']), exist_ok=True)
            transposed_path = os.path.join(transpose_root_dir, midi['Genre'] + '/', midi['md5'] + '.mid')
            try:
                original_stream = converter.parse(original_path)
                estimate_tone = original_stream.analyze('key').tonic
                semitones = interval.Interval(estimate_tone, c_tone).semitones

                mid = pretty_midi.PrettyMIDI(original_path)
                for instr in mid.instruments:
                    if instr.is_drum:
                        # Drum "pitches" select percussion sounds; never shift them.
                        continue
                    for note in instr.notes:
                        shifted = note.pitch + semitones
                        # 0 and 127 are both valid MIDI pitches.
                        if 0 <= shifted <= 127:
                            note.pitch = shifted
                mid.write(transposed_path)

                midi_collection.update_one({'_id': midi['_id']}, {'$set': {'Transposed': True}})
                done = midi_collection.count_documents({'Transposed': True})
                total = midi_collection.count_documents({})
                print('Progress: {:.2%}\n'.format(done / total))
            except Exception:
                # Deliberately broad: a file that cannot be parsed must not
                # stop the batch. KeyboardInterrupt still propagates.
                print(traceback.format_exc())
这一函数中,首先用 music21.converter 解析MIDI文件,并通过 analyze('key') 估计其调性,然后根据主音与C之间的半音距离,用 pretty_midi 平移所有非鼓音轨的音高,从而将其转调到C大调或C小调(调式保持不变)
实例
转调前:
转调后:
def tempo_unify_and_merge():
    """Rescale every MIDI file not yet marked ``MergedAndScaled`` to 120 BPM.

    For each such document the file ``<root_dir>/<Genre>/<md5>.mid`` is
    loaded with pretty_midi and the start and end time of every note is
    multiplied by ``original_tempo / 120``, where ``original_tempo`` is the
    first value returned by ``get_tempo``. The result is passed through
    ``get_merged_from_pm`` (defined elsewhere; presumably merges the
    instrument tracks, confirm against its definition) and written to
    ``<merged_root_dir>/<Genre>/<md5>.mid``.

    Errors for one file are printed and the file stays unmarked.
    """
    midi_collection = get_midi_collection()
    root_dir = 'E:/transposed_midi/'
    merged_root_dir = 'E:/merged_midi/'

    # A cursor opened with no_cursor_timeout must be closed explicitly.
    with midi_collection.find({'MergedAndScaled': False}, no_cursor_timeout=True) as cursor:
        for midi in cursor:
            original_path = os.path.join(root_dir, midi['Genre'] + '/', midi['md5'] + '.mid')
            try:
                original_tempo = get_tempo(original_path)[0]
                # Note times are multiplied by this factor: a 60 BPM source
                # gets 0.5, i.e. all notes are squeezed into half the time.
                changed_rate = original_tempo / 120

                os.makedirs(os.path.join(merged_root_dir, midi['Genre']), exist_ok=True)

                pm = pretty_midi.PrettyMIDI(original_path)
                for instr in pm.instruments:
                    for note in instr.notes:
                        note.start *= changed_rate
                        note.end *= changed_rate

                merged_path = os.path.join(merged_root_dir, midi['Genre'] + '/', midi['md5'] + '.mid')
                merged = get_merged_from_pm(pm)
                merged.write(merged_path)

                midi_collection.update_one({'_id': midi['_id']}, {'$set': {'MergedAndScaled': True}})
                done = midi_collection.count_documents({'MergedAndScaled': True})
                total = midi_collection.count_documents({})
                print('Progress: {:.2%}\n'.format(done / total))
            except Exception:
                # Report instead of silently skipping, so failing files can
                # be diagnosed; the batch continues with the next file.
                print(traceback.format_exc())
这一函数使用了 pretty_midi 库支持的对MIDI文件的操作,按源文件的BPM与120BPM的比例,对所有Note的起始时间和终止时间进行缩放。
实例:
统一速度后:
百度云下载链接,提取码:fm8f
资源介绍: