Read the full HTML content of the site's page and decode it into a text string (the relevant methods of urllib.request can be used), then write it to the file page.txt in UTF-8 encoding.
import urllib.request

with urllib.request.urlopen('https://dblp.dagstuhl.de/db/conf/kdd/kdd2023.html') as response:
    html = response.read()
    html_text = html.decode()

with open('page.txt', 'w', encoding='utf-8') as f:
    f.write(html_text)
Open page.txt and observe how the key elements such as track names and paper titles are laid out. Then extract the name of each track from this text string and print it (the string methods split() and strip() can be used).
import re

with open('page.txt', 'r', encoding='utf-8') as f:
    content = f.read()

# Use a regular expression to find every string between the <h2 ...> and </h2> tags
# that wrap each track name (the pattern assumes DBLP's markup)
matches = re.findall(r'<h2[^>]*>(.*?)</h2>', content)
for match in matches:
    print(match)
def extract(text):
    # Strip HTML tags from a fragment and return only the visible text.
    pieces = text.split('>')
    if len(pieces) == 1 and len(pieces[0].split('<')) == 1:
        return text
    if len(pieces[0].split('<')) < 2:
        pieces[0] = '<' + pieces[0]
    pieces = [piece.split('<')[0] for piece in pieces]
    output = ''.join(pieces)
    return output

# Each track section starts with an <h2> heading (the markers assume DBLP's markup)
track_part = html_text.split('<h2')[1:]
track_name_text = [part.split('</h2>')[0] for part in track_part]
track_name = [extract(name_text) for name_text in track_name_text]
track_name
Research Track Full Papers
Applied Data Track Full Papers
Hands On Tutorials
Lecture Style Tutorials
Workshop Summaries
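For illustration, extract() simply drops everything between angle brackets and keeps the visible text; applied to a hypothetical DBLP-style fragment:

sample = '<span itemprop="name" title="Florian Adriaens">Florian Adriaens</span>, '
print(extract(sample))  # prints "Florian Adriaens, "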
As the output shows, the papers in "Research Track Full Papers" and "Applied Data Track Full Papers" make up the vast majority. We now want to extract the information of every paper under these two tracks (the author list authors, the paper title title, and the first and last pages startPage and endPage), store it in a list of dictionaries in the format below, print the number of papers in each of the two tracks, convert the list of dictionaries into a JSON object (the relevant methods of the json package can be used), and write it to the file kdd23.json with an indentation of 2 characters.
[
{
"track": "Research Track Full Papers",
"papers": [
{
"authors": [
"Florian Adriaens",
"Honglian Wang",
"Aristides Gionis"
],
"title": "Minimizing Hitting Time between Disparate Groups with Shortcut Edges.",
"startPage": 1,
"endPage": 10
},
...
]
  },
{
"track": "Applied Data Track Full Papers",
"papers": [
{
"authors": [
"Florian Adriaens",
"Honglian Wang",
"Aristides Gionis"
],
"title": "Minimizing Hitting Time between Disparate Groups with Shortcut Edges.",
"startPage": 1,
"endPage": 10
},
...
]
}
]
import re
import json

with open('page.txt', 'r', encoding='utf-8') as f:
    content = f.read()

# A list to hold the information of each track
tracks = []

# Regular expressions (the patterns assume DBLP's markup: track names inside <h2>,
# author names in itemprop="author"/"name" spans, titles in <span class="title">,
# and page ranges in an itemprop="pagination" span; adjust if the page structure differs)
track_pattern = re.compile(r'<h2[^>]*>(.*?)</h2>')
author_pattern = re.compile(r'itemprop="author".*?itemprop="name"[^>]*>(.*?)</span>', re.S)
title_pattern = re.compile(r'<span class="title"[^>]*>(.*?)</span>')
page_pattern = re.compile(r'<span itemprop="pagination">(.*?)-(.*?)</span>')

# Locate "Research Track Full Papers", "Applied Data Track Full Papers" and "Hands On Tutorials"
# (back up a little so that the opening <h2> tag is included)
start1 = content.find('Research Track Full Papers') - 50
start2 = content.find('Applied Data Track Full Papers') - 50
start3 = content.find('Hands On Tutorials') - 1

# Split the first two track sections into the chunks between adjacent
# <li class="entry ...> tags (each chunk corresponds to one paper)
research_papers_content = re.split('<li class="entry', content[start1:start2])[1:]
applied_papers_content = re.split('<li class="entry', content[start2:start3])[1:]

def extract_paper_info(papers_content):
    # Keep only the citation part of each entry (everything before </cite>)
    papers = []
    for paper_content in papers_content:
        paper_content = re.split('</cite>', paper_content)[0]
        papers.append(paper_content)
    return papers

spit_research_content = extract_paper_info(research_papers_content)
spit_applied_content = extract_paper_info(applied_papers_content)

print("Number of research papers: ", len(research_papers_content))
print("Number of applied papers: ", len(applied_papers_content))

# Extract the authors, title, startPage and endPage of every paper
def extract_paper_info(papers_content):
    papers = []
    for paper_content in papers_content:
        authors = author_pattern.findall(paper_content)
        titles = title_pattern.findall(paper_content)
        pages = page_pattern.search(paper_content)
        startPage, endPage = map(int, pages.groups())
        papers.extend([{'authors': authors, 'title': title, 'startPage': startPage, 'endPage': endPage} for title in titles])
    return papers

# Extract the paper information of "Research Track Full Papers"
research_track = track_pattern.search(content[start1:start2]).group(1)
research_papers = extract_paper_info(spit_research_content)

# Extract the paper information of "Applied Data Track Full Papers"
applied_track = track_pattern.search(content[start2:start3]).group(1)
applied_papers = extract_paper_info(spit_applied_content)

# Store the paper information in a list of dictionaries
tracks.append({'track': research_track, 'papers': research_papers})
tracks.append({'track': applied_track, 'papers': applied_papers})

# Convert the list of dictionaries to JSON and write it to the file
with open('kdd23.json', 'w', encoding='utf-8') as f:
    json.dump(tracks, f, indent=2)
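A small aside on json.dump: by default ensure_ascii=True, so any non-ASCII characters in author names are written as \uXXXX escapes. If you prefer them written literally, the last two lines above can be varied as follows:

with open('kdd23.json', 'w', encoding='utf-8') as f:
    json.dump(tracks, f, indent=2, ensure_ascii=False)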
def sep_paper(text):
    # Split one <cite> fragment into its author part, title part and page part
    # (the split markers assume DBLP's markup)
    author_info, title_info = text.split('<span class="title"')
    title_info, pages_info = title_info.split('<span itemprop="pagination">')
    authors = extract(author_info)
    title = extract(title_info)
    pages = extract(pages_info).split('-')
    output = {
        "authors": authors.strip().strip(':').split(', '),
        "title": title.strip(),
        "startPage": int(pages[0]),
        "endPage": int(pages[1])
    }
    return output

track_candidate = ["Research Track Full Papers", "Applied Data Track Full Papers"]
track_obj = []
forward_authors = []  # saved for the researcher-crawling task below
for i in range(len(track_name)):
    if track_name[i] in track_candidate:
        track_info = track_part[i]
        # one <cite> ... </cite> fragment per paper
        papers_info = track_info.split('<cite')[1:]
        papers_info = [info.split('</cite>')[0] for info in papers_info]
        papers = [sep_paper(paper) for paper in papers_info]
        print(len(papers))
        track_obj.append({"track": track_name[i], "papers": papers})
        forward_authors.extend(papers_info[:10])  # the first 10 papers of each track

import json
track_str = json.dumps(track_obj, indent=2)
with open("kdd23.json", "w", encoding="UTF-8") as file:
    file.write(track_str)
Number of research papers: 313
Number of applied papers: 183
The resulting kdd23.json file has 6404 lines in total.
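As a quick sanity check, the paper counts and the line total reported above can be reproduced from the generated file (a small sketch, assuming kdd23.json was written by one of the scripts above):

import json

with open('kdd23.json', 'r', encoding='utf-8') as f:
    tracks = json.load(f)
for track in tracks:
    print(track['track'], len(track['papers']))  # 313 and 183 papers
with open('kdd23.json', 'r', encoding='utf-8') as f:
    print(sum(1 for _ in f))                     # 6404 lines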
Based on the page text crawled earlier, for every author of the first 10 papers in each of these two tracks, crawl the following information: (1) the researcher's academic identifier orcID (crawl all of them if there are several); (2) the information of every paper the researcher has published from 2020 to the present (including the authors, the title, the publication info publishInfo, and the year). Convert the final result into a JSON object and write it to the file researchers.json with an indentation of 2 characters, in the following storage format:
[
{
"researcher": "Florian Adriaens",
"orcID": [
"0000-0001-7820-6883"
],
"papers": [
{
"authors": [
"Florian Adriaens",
"Honglian Wang",
"Aristides Gionis"
],
"title": "Minimizing Hitting Time between Disparate Groups with Shortcut Edges.",
"publishInfo": "KDD 2023: 1-10",
"year": 2023
},
...
]
},
...
]
import re
import requests
import json
import time
import random

# Open and read the "page.txt" file
with open('page.txt', 'r', encoding='utf-8') as f:
    content = f.read()

# Regular expressions (the patterns assume DBLP's markup: author profile links carry
# itemprop="url", ORCIDs appear as orcid.org links, the researcher's name appears in the
# page <title>, and the publication year sits in an itemprop="datePublished" span)
author_link_pattern = re.compile(r'itemprop="author"[^>]*>\s*<a href="(.*?)" itemprop="url">')
orcID_pattern = re.compile(r'https://orcid\.org/(.{19})')
researcher_pattern = re.compile(r'<title>dblp: (.*?)</title>')
year_pattern = re.compile(r'itemprop="datePublished"[^>]*>(.*?)</span>')

# Locate "Research Track Full Papers" and "Applied Data Track Full Papers"
start1 = content.find('Research Track Full Papers')
start2 = content.find('Applied Data Track Full Papers')
end = len(content)

# Take these two sections and keep the content between the first 10 "persistent URL:"
# markers (one marker per paper entry), i.e. the first 10 papers of each track
research_papers_content = content[start1:start2].split('persistent URL:')[1:11]
applied_papers_content = content[start2:end].split('persistent URL:')[1:11]

def extract_paper_info(papers_content):
    # Keep only the part of each chunk up to the end of its citation
    papers = []
    for paper_content in papers_content:
        paper_content = re.split('</cite>', paper_content)[0]
        papers.append(paper_content)
    return papers

spit_research_content = extract_paper_info(research_papers_content)
spit_applied_content = extract_paper_info(applied_papers_content)

def extract_paper_info2(paper_content):
    final_result = []
    # Use a regular expression to collect every string that lies outside "<>" tags
    outside_brackets = re.split(r'<[^>]*>', paper_content)
    # Drop every string that contains 'http' together with everything before it
    flag = -1
    for i in range(len(outside_brackets)):
        if 'http' in outside_brackets[i]:
            flag = i
    for i in range(flag + 1, len(outside_brackets)):
        if outside_brackets[i]:
            final_result.append(outside_brackets[i])
    return final_result

# A list to hold the researcher information
researchers = []

# Visit the profile link of every author of every paper and fetch the author's orcID
# and paper information
for papers in [research_papers_content, applied_papers_content]:
    for paper in papers:
        author_links = author_link_pattern.findall(paper)
        for link in author_links:
            link_content = requests.get(link)
            response = link_content.text
            # Sending requests too frequently may be treated by the site as an attack and raise
            # "ConnectionResetError: [WinError 10054] An existing connection was forcibly closed
            # by the remote host", so two counter-measures are taken:
            # close the response as soon as it has been used
            link_content.close()
            # and add a random delay between consecutive requests
            time.sleep(random.randint(1, 3))
            tempmatch = researcher_pattern.search(response)
            if tempmatch is not None:
                researcher = tempmatch.group(1)
            else:
                # fallback for pages whose title is structured differently
                researcher_pattern1 = re.compile(r'<h1>(.*?)</h1>')
                researcher = researcher_pattern1.search(response).group(1)
            orcID = orcID_pattern.findall(response)
            # Keep only the section between the "2020 – today" heading and the heading
            # of the previous decade
            start = response.find('2020 – today')
            end = response.find('2010 – 2019')
            if end == -1:
                end = len(response)
            # Split this section into chunks, one </cite> citation per chunk
            papers_content = response[start:end].split('</cite>')[0:-1]
            papers_dict = []
            for paper_content in papers_content:
                spit_content = extract_paper_info2(paper_content)
                year = int(year_pattern.search(paper_content).group(1))
                authors = []
                publishInfo = []
                for i in range(0, len(spit_content) - 1):
                    if spit_content[i] != ", " and (spit_content[i+1] == ", " or spit_content[i+1] == ":"):
                        authors.append(spit_content[i])
                    elif spit_content[i-2] == ":" and spit_content[i-1] == " ":
                        title = spit_content[i]
                        for k in range(i+2, len(spit_content)):
                            publishInfo.append(spit_content[k])
                # Create a new dictionary for each paper
                paper_dict = {'authors': authors, 'title': title, 'publishInfo': ''.join(publishInfo), 'year': year}
                papers_dict.append(paper_dict)
            researchers.append({'researcher': researcher, 'orcID': orcID, 'papers': papers_dict})

# Convert the list of dictionaries to JSON and write it to "researchers.json"
with open('researchers.json', 'w', encoding='utf-8') as f:
    json.dump(researchers, f, indent=2)
authors_info = "".join(forward_authors).split('<a href="')[1:]
authors_urls = [author.split('" itemprop="url">')[0] for author in authors_info]
authors_urls = list(set(authors_urls))
researchers_obj = []

def sep_paper_new(text, year):
    # Split one <cite> fragment from an author page into co-authors, title and
    # publication info (the split markers assume DBLP's markup)
    coauthor_info, title_info = text.split('<span class="title"')
    title_info, *pub_info = title_info.split('</span>')
    pub_info = ''.join(pub_info).split('</cite>')[0]
    coauthors = extract(coauthor_info).strip().strip(':').split(', ')
    title = extract(title_info)
    pub = extract(pub_info).strip().strip('[content]')
    output = {
        "authors": coauthors,
        "title": title,
        "publishInfo": pub,
        "year": int(year),
    }
    return output
from urllib.request import urlopen

for i in range(len(authors_urls)):
    print(f"{i+1}/{len(authors_urls)}")
    author_url = authors_urls[i]
    author_html = urlopen(url=author_url)
    author_text = bytes.decode(author_html.read())
    # the researcher's name is taken from the page <title> ("dblp: Name" is assumed markup)
    author_info = author_text.split('<title>dblp: ')[1]
    researcher = author_info.split('<')[0].strip('"').strip()
    try:
        # the part of the page after the first orcid.org link (IndexError if there is none)
        orcID_info = author_info.split('https://orcid.org/')[1]
        # an ORCID is exactly 19 characters long; there may be further orcid.org links below
        orcID = [orcID_info[:19]] + [part[:19] for part in orcID_info.split('https://orcid.org/')[1:]]
        orcID = list(dict.fromkeys(orcID))
    except IndexError:
        orcID = []
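The rest of this block was cut off above. A minimal sketch of how it could continue inside the same loop over authors_urls, assuming DBLP author pages separate decades with "2020 – today" / "2010 – 2019" headings, group entries under <li class="year"> markers, and wrap each citation in <cite> ... </cite>:

    # publications from 2020 to today: the section between the two decade headings
    start = author_text.find('2020 – today')
    end = author_text.find('2010 – 2019')
    section = author_text[start:end] if end != -1 else author_text[start:]
    papers = []
    for year_part in section.split('<li class="year">')[1:]:
        year = year_part.split('<')[0].strip()  # e.g. "2023"
        for cite in year_part.split('<cite')[1:]:
            papers.append(sep_paper_new(cite.split('</cite>')[0], year))
    researchers_obj.append({"researcher": researcher, "orcID": orcID, "papers": papers})

researchers_str = json.dumps(researchers_obj, indent=2)
with open("researchers.json", "w", encoding="UTF-8") as file:
    file.write(researchers_str)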
As of March 30, 2024, the researchers.json produced by the first script has 88448 lines before duplicate authors are removed and 83685 lines after deduplication, whereas the researchers.json crawled by the second script has 84785 lines. Both scripts crawl the same 105 author links, and the cause of the difference is not yet clear.
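The first script requests an author page once per paper the author appears in, so the same researcher can occur several times in its output. The deduplication mentioned above can be done along these lines (a sketch that keeps the first occurrence of each researcher name; the output file name is chosen here for illustration):

import json

with open('researchers.json', 'r', encoding='utf-8') as f:
    researchers = json.load(f)
seen = set()
unique_researchers = []
for r in researchers:
    if r['researcher'] not in seen:
        seen.add(r['researcher'])
        unique_researchers.append(r)
with open('researchers_dedup.json', 'w', encoding='utf-8') as f:
    json.dump(unique_researchers, f, indent=2)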
The following Python code finds the differences between the two JSON files:
import json
import difflib

def find_json_differences(file1, file2):
    with open(file1, 'r') as f1, open(file2, 'r') as f2:
        data1 = json.load(f1)
        data2 = json.load(f2)
    diff = list(difflib.ndiff(json.dumps(data1, indent=2).splitlines(),
                              json.dumps(data2, indent=2).splitlines()))
    for line in diff:
        print(line)

"""
+ means the line is present in file2 but not in file1.
- means the line is present in file1 but not in file2.
? marks a line on which the two files differ only slightly, e.g. in whitespace or punctuation.
"""

# the two JSON files to compare
file1 = 'researchers.json'
file2 = r'F:\Desktop\researchers.json'
find_json_differences(file1, file2)
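As a toy illustration of these markers (hypothetical input lines, not taken from the real files):

import difflib

left = ['"startPage": 1\n', '"endPage": 10\n']
right = ['"startPage": 2\n', '"endPage": 10\n']
for line in difflib.ndiff(left, right):
    print(line, end='')
# the common line is printed with two leading spaces, the differing line once with '-'
# (the left version) and once with '+' (the right version), plus optional '?' hint lines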
To show the differences between two ordinary text files, the following code can be used directly; difflib.unified_diff() reports exactly which lines differ:
import difflib

def find_differences(file1, file2):
    with open(file1, 'r') as f1, open(file2, 'r') as f2:
        lines1 = f1.readlines()
        lines2 = f2.readlines()
    diff = difflib.unified_diff(lines1, lines2)
    for line in diff:
        print(line)

# the two files to compare
file1 = 'file1.txt'
file2 = 'file2.txt'
find_differences(file1, file2)