Querying the English Wikipedia API from Python

Problem Description

The Wikipedia corpus is an important, publicly available large-scale corpus (and knowledge base) for text mining and natural language processing experiments. We sometimes need to search it and extract information from the result pages, such as the body text, the title, and the page length. A common approach is to download a Wikipedia dump to the local machine, but this places real demands on local resources (memory, disk) and requires a great deal of tedious preprocessing; if search is also needed, retrieval must be implemented locally as well, so the preprocessing workload is considerable.

If we are not using the full Wikipedia corpus for model training (e.g., pre-training), we can instead call the API that Wikipedia provides to quickly search, fetch page text, and parse page information. Two Python packages currently wrap this functionality, Wikipedia-API and wikipedia, but after trying both I found that their search features are not implemented particularly well. So, based on my own experimental needs and the Wikipedia API documentation, I wrote a relatively simple script that searches English Wikipedia and retrieves all result pages, fetches page text, and resolves a page's URL from its page ID or page title. The code is below, shared for anyone who needs it.

Solution

import requests
from urllib import parse


class SearchError(Exception):
	# Raised when the returned result dict contains a `warnings` or `error` key
	def __init__(self, exception_type, except_info, url):
		super().__init__(self)
		self.except_info = except_info
		self.url = url
		self.exception_type = exception_type
	
	def __str__(self):
		return "There is {} when call the URL:\n{}\nThe detailed {} information is as follow:\n{}".format(
			self.exception_type, self.url, self.exception_type, self.except_info)


def wiki_search(query):
	# for more API information, refer to 'https://www.mediawiki.org/wiki/API:Search'
	# The search results may differ slightly between calls.
	para_dict = {
		'action': 'query',
		'list': 'search',
		'srsearch': query,
		'utf8': '',
		'format': 'json',
		'srsort': 'relevance',
		'srprop': 'wordcount',
		'srlimit': 500,
		'sroffset': 0
	}
	# If more results remain, set sroffset on the next call to sroffset + srlimit (see wiki_search_all below)
	url = 'https://en.wikipedia.org/w/api.php?' + parse.urlencode(para_dict)
	# print(url)
	response = requests.get(url)
	if response.status_code == 200:
		result_dict = response.json()
		totalhits = result_dict['query']['searchinfo']['totalhits']
		# print(totalhits)
		result_list = result_dict['query']['search']
		return result_list
		return result_list
	else:
		return None
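

# A hedged sketch, not part of the original script: the pagination note in
# wiki_search can be turned into a loop that advances sroffset until the API
# stops returning a `continue` block. The helper name `wiki_search_all` and
# the max_results cap are my own additions.
def wiki_search_all(query, max_results=5000):
	results = []
	offset = 0
	while offset < max_results:
		para_dict = {
			'action': 'query',
			'list': 'search',
			'srsearch': query,
			'utf8': '',
			'format': 'json',
			'srsort': 'relevance',
			'srprop': 'wordcount',
			'srlimit': 500,
			'sroffset': offset
		}
		url = 'https://en.wikipedia.org/w/api.php?' + parse.urlencode(para_dict)
		response = requests.get(url)
		if response.status_code != 200:
			break
		result_dict = response.json()
		results.extend(result_dict['query']['search'])
		if 'continue' not in result_dict:  # no more result pages
			break
		offset = result_dict['continue']['sroffset']
	return results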


def search_by_id(pageid):
	para_dict = {
		'action': 'query',
		'prop': 'info',
		'pageids': pageid,
		'inprop': 'url',
		'format': 'json'
	}
	url = 'https://en.wikipedia.org/w/api.php?' + parse.urlencode(para_dict)
	# print(url)
	response = requests.get(url)
	if response.status_code == 200:
		result_dict = response.json()
		
		if 'error' in result_dict.keys():
			raise SearchError('ERROR', result_dict['error'], url)
		if 'warnings' in result_dict.keys():
			raise SearchError('warning', result_dict['warnings'], url)
		
		page = result_dict['query']['pages'][str(pageid)]
		title = page['title']
		page_url = page['fullurl']
		# print(pageid, title, page_url)
		return title, page_url
	else:
		return None, None


def search_by_title(title):
	# If the returned pageid is -1, no such page exists
	para_dict = {
		'action': 'query',
		'prop': 'info',
		'titles': title,
		'inprop': 'url',
		'format': 'json'
	}
	url = 'https://en.wikipedia.org/w/api.php?' + parse.urlencode(para_dict)
	# print(url)
	response = requests.get(url)
	if response.status_code == 200:
		result_dict = response.json()
		
		if 'error' in result_dict.keys():
			raise SearchError('ERROR', result_dict['error'], url)
		if 'warnings' in result_dict.keys():
			raise SearchError('warning', result_dict['warnings'], url)
		
		pageid = list(result_dict['query']['pages'].keys())[0]
		page_url = result_dict['query']['pages'][pageid]['fullurl']
		# print(pageid, title, page_url)
		return pageid, page_url
	else:
		return None, None


def page_text(pageid):
	# For more API information, refer to 'https://www.mediawiki.org/wiki/Extension:TextExtracts#API'
	para_dict = {
		'action': 'query',
		'prop': 'extracts',
		'pageids': pageid,
		'explaintext': '',
		'format': 'json'
	}
	url = 'https://en.wikipedia.org/w/api.php?' + parse.urlencode(para_dict)
	# print(url)
	response = requests.get(url)
	if response.status_code == 200:
		result_dict = response.json()
		
		if 'error' in result_dict.keys():
			raise SearchError('ERROR', result_dict['error'], url)
		if 'warnings' in result_dict.keys():
			raise SearchError('warning', result_dict['warnings'], url)
		
		page = result_dict['query']['pages'][str(pageid)]
		title = page['title']
		text = page['extract']
		# print(pageid, title, text)
		return title, text
	else:
		return None, None
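

# A variant sketch, not in the original script: the TextExtracts extension
# also accepts an `exintro` flag, which limits the extract to the text
# before the first section heading. Handy when only a summary is needed.
def page_intro(pageid):
	para_dict = {
		'action': 'query',
		'prop': 'extracts',
		'pageids': pageid,
		'exintro': '',
		'explaintext': '',
		'format': 'json'
	}
	url = 'https://en.wikipedia.org/w/api.php?' + parse.urlencode(para_dict)
	response = requests.get(url)
	if response.status_code != 200:
		return None
	result_dict = response.json()
	return result_dict['query']['pages'][str(pageid)]['extract']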


def page_info(pageid):
	# for more API information, refer to 'https://www.mediawiki.org/wiki/API:Parsing_wikitext'
	para_dict = {
		'action': 'parse',
		'pageid': pageid,
		'format': 'json'
	}
	url = 'https://en.wikipedia.org/w/api.php?' + parse.urlencode(para_dict)
	# print(url)
	response = requests.get(url)
	if response.status_code == 200:
		result_dict = response.json()
		if 'error' in result_dict.keys():
			raise SearchError('ERROR', result_dict['error'], url)
		if 'warnings' in result_dict.keys():
			raise SearchError('warning', result_dict['warnings'], url)
		parse_result = result_dict['parse']
		# parse_result contains keys such as 'title', 'pageid', 'text', 'sections' and 'links'
		return parse_result
	else:
		return None
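
The dict returned by page_info mirrors the JSON of action=parse, so its fields can be picked out directly. As an illustration (my own snippet, not from the original post), the section outline of a page can be read from its 'sections' key:

for sec in page_info(pageid='4848272')['sections']:
	print(sec['level'], sec['line'])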

Usage example:

result_list = wiki_search(query='trump')
title = result_list[1]['title']
pageid = str(result_list[1]['pageid'])
print(title, pageid)
print(search_by_id(pageid=pageid))
print(search_by_title(title=title))
print(page_text(pageid=pageid))

Example output:

Donald Trump 4848272
('Donald Trump', 'https://en.wikipedia.org/wiki/Donald_Trump')
('4848272', 'https://en.wikipedia.org/wiki/Donald_Trump')
('Donald Trump', 'Donald John Trump (born June 14, 1946) is ...

Code Notes

  • The code above simply calls the English Wikipedia API through the requests module; it is not as complete or rigorous as the two Python packages mentioned earlier. Readers can adjust the API parameters or the script itself to get the results they need.
  • If you need large-scale, high-frequency access, do not use this code or the two packages mentioned above: Wikipedia will throttle or block your connections. A more courteous calling pattern is sketched below.
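
For moderate use, a more polite pattern is to identify your client with a descriptive User-Agent header and to pause between requests. The sketch below is my own suggestion (the header contents are placeholders, not from the original post):

import time

session = requests.Session()
# Identify the client; replace the placeholder name and contact address with your own
session.headers.update({'User-Agent': 'my-wiki-script/0.1 (you@example.com)'})
url = 'https://en.wikipedia.org/w/api.php?' + parse.urlencode({
	'action': 'query', 'prop': 'info', 'pageids': '4848272',
	'inprop': 'url', 'format': 'json'})
response = session.get(url)
time.sleep(1)  # throttle: at most one request per second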

References

[1] wikipedia package: https://pypi.org/project/wikipedia/
[2] Wikipedia-API: https://pypi.org/project/Wikipedia-API/
[3] API:Search: https://www.mediawiki.org/wiki/API:Search
[4] Extension:TextExtracts: https://www.mediawiki.org/wiki/Extension:TextExtracts#API
[5] API:Parsing wikitext: https://www.mediawiki.org/wiki/API:Parsing_wikitext
