pyBibX is one of the better Python libraries for bibliometric analysis, offering a rich set of methods for exploratory reports, visualizations, network analysis, and NLP.
1 Installation
pip install pyBibX
2 Data
See the resources attached to this article.
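The file used below is a PubMed export saved as pubmed.txt (loaded with db='pubmed' in the next section). A minimal sketch for peeking at the raw file before loading it, assuming it sits in the working directory:
# Quick sanity check of the raw PubMed export (plain-text file assumed to be in the working directory)
with open('pubmed.txt', encoding='utf-8') as f:
    for _, line in zip(range(10), f):
        print(line.rstrip())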
3 Hands-on Analysis
3.1 Importing the Data and Generating a Summary Report
import numpy as np
import pandas as pd
import textwrap
from pyBibX.base import pbx_probe
from prettytable import PrettyTable
file_name = 'pubmed.txt'
database = 'pubmed'
# load data
bibfile = pbx_probe(file_bib=file_name, db=database, del_duplicated=True)
# Generate EDA (Exploratory Data Analysis) Report
report = bibfile.eda_bib()
# Check Report
report
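Outside a notebook, the report can also be printed or written to disk. A small sketch, assuming the report object renders as plain text:
# Print the report in a plain Python session and keep a copy on disk (assumes str(report) is meaningful text)
print(report)
with open('eda_report.txt', 'w', encoding='utf-8') as f:
    f.write(str(report))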
3.2 Word Cloud
bibfile.word_cloud_plot(entry='kwa', size_x=15, size_y=10, wordsn=500)
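If you need the word cloud as an image file, and assuming the plot is rendered on the active matplotlib figure, it can be saved right after the call:
import matplotlib.pyplot as plt  # assumption: word_cloud_plot draws on the current matplotlib figure
plt.savefig('word_cloud.png', dpi=300, bbox_inches='tight')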
3.3 Word Importance Ranking
# Check Table for important word
table = PrettyTable()
data_wd = bibfile.ask_gpt_wd
table.field_names = ['Word', 'Importance']
for key, value in data_wd.items():
    table.add_row([key, round(value, 4)])
print(table)
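Since data_wd behaves as a word-to-importance mapping (it is iterated with .items() above), the top entries can also be pulled out directly:
# Top 10 words by importance, sorted in descending order
top_words = sorted(data_wd.items(), key=lambda kv: kv[1], reverse=True)[:10]
for word, importance in top_words:
    print(f'{word}: {importance:.4f}')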
3.4 N-Grams: an n-gram is a sequence of n consecutive items in a text document; the items may be words, numbers, symbols, or punctuation.
bibfile.get_top_ngrams(view='notebook', entry='kwp', ngrams=4, stop_words=[], rmv_custom_words=[], wordsn=15)
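To make the concept concrete, here is a small standalone illustration of extracting 4-grams from a sentence (independent of pyBibX):
# Build all 4-grams (sequences of 4 consecutive tokens) from a toy sentence
tokens = 'machine learning methods for systematic literature review'.split()
ngrams = [' '.join(tokens[i:i+4]) for i in range(len(tokens) - 3)]
print(ngrams)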
3.5 Document Clustering
projection, labels = bibfile.docs_projection(view='notebook', corpus_type='abs',
                                             stop_words=['en'], rmv_custom_words=[],
                                             custom_label=[], custom_projection=[],
                                             n_components=2, n_clusters=5,
                                             tf_idf=False, embeddings=False,
                                             method='umap')
data_pr = pd.DataFrame(np.hstack([projection, labels.reshape(-1,1)]))
# Check Articles per Cluster
cluster = 1
idx_articles = [i for i in range(0, labels.shape[0]) if labels[i] == cluster]
print(*idx_articles, sep=', ')
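To see how many documents fall into each cluster, the labels array can be summarized directly; a short sketch using the labels returned above:
# Count documents per cluster label
values, counts = np.unique(labels, return_counts=True)
for v, c in zip(values, counts):
    print(f'Cluster {int(v)}: {c} documents')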
3.6 Keyword Evolution by Year
bibfile.plot_evolution_year(view='notebook',
stop_words=['en'],
rmv_custom_words=[],
key='kwp',
topn=10,
start=2010,
end=2021)
# View Table
data_ep = bibfile.ask_gpt_ep
3.7 Sankey Diagram: Authors / Countries / Institutions / Languages
bibfile.sankey_diagram(view='notebook', entry=['aut', 'cout', 'inst', 'lan'], topn=10)
# View Table
data_sk = bibfile.ask_gpt_sk
pd.DataFrame(data_sk)
3.8 Journal Tree Map
bibfile.tree_map(entry='jou', topn=20, size_x=30, size_y=30)
3.9 Author Productivity Plot
bibfile.authors_productivity(view='notebook', topn=20)
3.10 Author Productivity Bar Chart
bibfile.plot_bars(statistic='apd', topn=20, size_x=15, size_y=10)
# View Table
data_bp = bibfile.ask_gpt_bp
3.11 Collaboration Network
bibfile.network_adj(view = 'notebook', adj_type = 'aut', min_count = 5, node_labels = True, label_type = 'name', centrality = None)
# PS: If a centrality criterion is used then the values can be obtained by the following command: bibfile.table_centr
# View Table
data_adj = bibfile.ask_gpt_adj
bibfile.find_nodes(node_ids = [], node_name = ['youngkong s'], node_only = False)
3.12 World Collaboration Map
bibfile.network_adj_map(view = 'browser', connections = True, country_lst = [])
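The country_lst argument can restrict the map to specific countries; a hedged usage sketch (the country names below are illustrative placeholders and must match the spelling used in the parsed records):
# Restrict the collaboration map to a subset of countries (names are placeholders)
bibfile.network_adj_map(view = 'notebook', connections = True, country_lst = ['china', 'united states of america'])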
3.13 NLP: Natural Language Processing
# NLP
# Arguments: corpus_type = 'abs', 'title', 'kwa', or 'kwp';
# stop_words = A list of stopwords to clean the corpus. ['ar', 'bn', 'bg', 'cs', 'en', 'fi', 'fr', 'de', 'el', 'hi', 'he', 'hu', 'it', 'ja', 'ko', 'mr', 'fa', 'pl', 'pt-br', 'ro', 'ru', 'es', 'sv', 'sk', 'zh', 'th', 'uk'];
# 'ar' = Arabic; 'bn' = Bengali; 'bg' = Bulgarian; 'cs' = Czech; 'en' = English; 'fi' = Finnish; 'fr' = French; 'de' = German; 'el' = Greek; 'he' = Hebrew; 'hi' = Hindi; 'hu' = Hungarian; 'it' = Italian;
# 'ja' = Japanese; 'ko' = Korean; 'mr' = Marathi; 'fa' = Persian; 'pl' = Polish; 'pt-br' = Portuguese-Brazilian; 'ro' = Romanian; 'ru' = Russian; 'es' = Spanish; 'sk' = Slovak; 'sv' = Swedish;
# 'zh' = Chinese; 'th' = Thai; 'uk' = Ukrainian
# rmv_custom_words = A list of custom stopwords to clean the corpus;
bibfile.create_embeddings(stop_words = ['en'], rmv_custom_words = [], corpus_type = 'abs')
emb = bibfile.embds
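A quick way to confirm the embeddings were created is to inspect their dimensions; a sketch assuming embds is array-like with one vector per document:
# Inspect the embedding matrix: number of documents x embedding dimension (assumes an array-like of vectors)
emb_arr = np.asarray(emb)
print(emb_arr.shape)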
# NLP - Topic -1 collects the outlier documents and should typically be ignored.
# Arguments: stop_words, rmv_custom_words = same options as described for create_embeddings above;
# embeddings = True or False. If True then word embeddings are used to create the topics
bibfile.topics_creation(stop_words = ['en'], rmv_custom_words = [], embeddings = True)
# NLP
# Topic assigned to each document
topics = bibfile.topics
# NLP
# Probability of each document belonging to its topic
probs = bibfile.probs
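The two outputs can be combined into a single table for inspection; a sketch assuming topics and probs each hold one entry per document:
# One row per document: assigned topic and its associated probability (or probability vector)
doc_topics = pd.DataFrame({'topic': list(topics), 'probability': list(probs)})
print(doc_topics.head())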
# NLP
# Arguments: view = 'notebook', 'browser' ('notebook' -> To plot in your preferred Notebook App. 'browser' -> To plot in your preferred browser window)
bibfile.graph_topics_distribution(view = 'notebook')
# NLP - same view argument as above
bibfile.graph_topics(view = 'notebook')
# NLP - same view argument as above
bibfile.graph_topics_projection(view = 'notebook')
# NLP - same view argument as above
bibfile.graph_topics_heatmap(view = 'notebook')
# NLP
similar_topics, similarity = bibfile.topic_model.find_topics('electre', top_n = 10)
for i in range(0, len(similar_topics)):
    print('Topic: ', similar_topics[i], 'Correlation: ', round(similarity[i], 3))
# NLP
bibfile.topic_model.save('my_topic_model')
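The saved model can be reloaded in a later session; a sketch assuming the underlying topic_model is a BERTopic instance (which is what the save call above suggests):
# Reload the persisted topic model (assumes a BERTopic model was saved above)
from bertopic import BERTopic
loaded_model = BERTopic.load('my_topic_model')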
abs_summary = bibfile.summarize_abst_peg(article_ids = [305, 34, 176], model_name = './pegasus-xsum')
# NLP - Check Abstractive Summarization
print(textwrap.fill(abs_summary, 150))
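Note that model_name = './pegasus-xsum' above points to a local directory. One way to populate it, assuming the transformers and sentencepiece packages are installed, is to download the google/pegasus-xsum checkpoint once and save it locally:
# Download the PEGASUS-XSum checkpoint once and store it where summarize_abst_peg expects it
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')
model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')
tokenizer.save_pretrained('./pegasus-xsum')
model.save_pretrained('./pegasus-xsum')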
abs_summary_chat = bibfile.summarize_abst_chatgpt(article_ids = [305, 34, 176], join_articles = True, api_key = 'your_api_key_here', query = 'from the following scientific abstracts, summarize the main information in a single paragraph using around 250 words', model = 'gpt-4')
# NLP - Check Abstractive Summarization
print(textwrap.fill(abs_summary_chat, 250))
# NLP - Extractive Summarization
# Arguments: article_ids = A list of documents to perform an extractive summarization with the available abstracts. If the list is empty then all documents will be used
ext_summary = bibfile.summarize_ext_bert(article_ids = [305, 34, 176])
# NLP - Check Extractive Summarization
print(textwrap.fill(ext_summary, 150))
The commands above summarize the selected abstracts.
3.14 Filtering Articles
bibfile.filter_bib(documents = [], doc_type = [], year_str = -1, year_end = -1, sources = [], core = -1, country = [], language = [], abstract = False)
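As a hedged usage sketch, reusing the argument list shown above, one could keep only documents published between 2015 and 2021 that have an abstract:
# Example filter: restrict the corpus to 2015-2021 documents with an available abstract
bibfile.filter_bib(documents = [], doc_type = [], year_str = 2015, year_end = 2021, sources = [], core = -1, country = [], language = [], abstract = True)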
For further details, see the original guide:
https://colab.research.google.com/drive/13CU-KvZMnazga1BmQf2J8wYM9mhHL2e1?usp=sharing#scrollTo=_11EAT72ED4N