wq
在数据解析之前,需要对数据中的异常文本(`<i>`、`<sup>`、`<sub>` 等文本修饰符标签)进行预处理,例如下文:
Background: Lung adenocarcinoma has a strong tendency to develop
into bone metastases, especially spinal metastases (SM). Long noncoding RNAs (lncRNAs) play
critical roles in regulating several biological processes in cancer cells. However, the
mechanisms underlying the roles of lncRNAs in the development of SM have not been
elucidated to date. Methods: Clinical specimens were collected for analysis of differentially expressed lncRNAs. The Kyoto Encyclopedia of Genes and Genomes (KEGG) was
used to examine the effects of these genes on pathways. RNA pull-down was utilized to
identify the targeting protein of lncRNAs. The effects of lncRNA on its target were
detected in A549 and SPCA-1 cells via perturbation of the lncRNA expression. Oncological
behavioral changes in transfected cells and phosphorylation of kinases in the relevant
pathways, with or without inhibitors, were observed. Further, tumorigenicity was found to
occur in experimental nude mice. Results: LINC00852s2 and the mitogen-activated
protein kinase (MAPK) pathway were found to be associated with SM. Moreover, the LINC00852
target S100A9 had a positive regulatory role in the progression, migration, invasion, and
metastasis of lung adenocarcinoma cells, both in vitro and in vivo.
Furthermore, S100A9 strongly activated the P38 and REK1/2 kinases, and slightly activated
the phosphorylation of the JNK kinase in the MAPK pathway in A549 and SPCA-1 cells.
Conclusion: LINC00852 targets S100A9 to promote progression and oncogenic ability in
lung adenocarcinoma SM through activation of the MAPK pathway. These findings suggest a
potential novel target for early intervention against SM in lung cancer.
具体解决方法就是在解析之前进行预处理,然后再解析,本人要处理的数据量较大,所以采用数组进行了批量处理:
import os
from variation_preprocess.pubmed_test import xml_parser
# Directory that is scanned for the source ``.xml`` files.
source_dir = 'G:\\Pubmed_file\\'

# Accumulators handed to listdir() by the script entry point: source XML
# paths, preprocessed XML paths and CSV output paths, in that order.
List_Fname, List_Sname, List_csvname = [], [], []
def listdir(path, list_Fname, list_Sname, list_csv,
            save_dir='G:/PubMed/', csv_dir='E:/PubMed/'):
    """Collect the ``.xml`` files found in *path* and derive their output paths.

    For every ``<stem>.xml`` entry in *path* three paths are appended, in
    lockstep, to the three lists passed in:

    * ``list_Fname`` -- the source file, ``<path>/<stem>.xml``
    * ``list_Sname`` -- the preprocessed copy, ``<save_dir>/<stem>_edited.xml``
    * ``list_csv``   -- the CSV output, ``<csv_dir>/<stem>.csv``

    Args:
        path: Directory to scan (not recursive).
        list_Fname: List extended in place with source XML paths.
        list_Sname: List extended in place with preprocessed XML paths.
        list_csv: List extended in place with CSV output paths.
        save_dir: Directory for the preprocessed XML files.
        csv_dir: Directory for the CSV files.

    Returns:
        The same three list objects, ``(list_Fname, list_Sname, list_csv)``.
    """
    # Sorted so the processing order does not depend on the file system.
    for file in sorted(os.listdir(path)):
        if not file.endswith('.xml'):
            continue
        stem = file[:-4]
        # Build the source path from *path* itself rather than a fixed
        # directory, so the function works for any directory it is given.
        list_Fname.append(os.path.join(path, file))
        list_Sname.append(os.path.join(save_dir, stem + '_edited.xml'))
        list_csv.append(os.path.join(csv_dir, stem + '.csv'))
    return list_Fname, list_Sname, list_csv
def xml_process(list_Pname, list_Sname):
    """Copy each XML file while flattening ``<sup>``/``<sub>`` markup.

    Inline superscript and subscript tags are replaced by plain-text
    markers so that the element text is no longer interrupted by child
    elements: ``<sup>`` becomes ``^``, ``<sub>`` becomes ``_`` and the
    closing tags are dropped.

    Args:
        list_Pname: Paths of the source XML files.
        list_Sname: Paths of the files to write, paired by position with
            ``list_Pname``.
    """
    for file_path, temp_save in zip(list_Pname, list_Sname):
        print(file_path, temp_save)
        # 'w' rather than 'a+': appending on a second run would leave two
        # XML documents in one file, which no parser accepts.
        with open(file_path, 'r', encoding='utf-8') as tf, \
                open(temp_save, 'w', encoding='utf-8') as sf:
            # Stream line by line so large files are never held in memory.
            for line in tf:
                line = line.replace('<sup>', '^').replace('</sup>', '')
                line = line.replace('<sub>', '_').replace('</sub>', '')
                sf.write(line)
if __name__ == '__main__':
    # Build the three parallel path lists from the source directory.
    list_Fname, list_Sname, list_csvname = listdir(source_dir, List_Fname, List_Sname, List_csvname)
    # The preprocessing pass is currently disabled; the loop below reads the
    # paths in list_Sname, i.e. the already preprocessed files.
    # xml_process(list_Fname,list_Sname)
    # Parse each preprocessed XML file into its matching CSV file.
    for file_direc, file_save in zip(list_Sname, list_csvname):
        xml_parser(file_direc, file_save)
然后就可以处理数据了,这里,我采用了SAX方法来解析xml 文档,因为它是非常适合处理批量数据的
# -*- coding:UTF-8 -*-
import xml.sax
import pandas as pd
# Number of PubmedArticle records completed so far. It lives at module level
# and is never reset here, so it keeps counting across handler instances.
i = 0


class SaxHandler(xml.sax.ContentHandler):
    """SAX handler that flattens ``PubmedArticle`` elements into rows.

    Each finished ``PubmedArticle`` appends one 18-item list to
    ``self.Pubmed``; the per-record fields are then cleared for the next
    record.

    Elements are recognised through ``self.Tags``, the history of start
    tags seen in the current record. Tags are appended when an element
    opens and never removed when it closes, so ``Tags[-2]`` is the
    previously *opened* element, not necessarily the parent. The date
    rules depend on this: after ``<PubDate><Year/><Month/>`` the entry
    ``Tags[-3]`` is ``PubDate`` while ``Tags[-1]`` is ``Month``. The list
    is seeded with five placeholders so negative indexes are always valid.
    """

    # Per-record string attributes that are reset to '' between records.
    _FIELDS = (
        'CurrentData', 'PMID', 'ISSN', 'date', 'Date_year', 'Date_month',
        'Date_day', 'DateCompleted', 'DateCompleted_Month',
        'DateRevised_Month', 'DateRevised', 'IssnType', 'CitedMedium',
        'ArticleType', 'ISOAbbreviation', 'Journal_Title', 'ArticleTitle',
        'ELocationID', 'AbstractText', 'Author', 'Author_full', 'LastName',
        'ForeName', 'Initials', 'Identifier', 'Affiliation', 'Keywords',
        'Language', 'PublicationType',
    )

    def __init__(self):
        super().__init__()
        # One list per completed PubmedArticle, in document order.
        self.Pubmed = []
        self.init()

    def init(self):
        """Reset every per-record field and the tag history."""
        for name in self._FIELDS:
            setattr(self, name, '')
        self.Tags = ['a', 'b', 'c', 'd', 'e']
        # Text chunks received since the last start/end tag.
        self._text = []

    def startElement(self, tag, attributes):
        """Record *tag* in the history and capture the attributes of interest."""
        # Text that preceded this tag belongs to the previous element, so it
        # must be dispatched before the history changes.
        self._flush_text()
        self.Tags.append(tag)
        prev = self.Tags[-2]

        if tag == 'ISSN':
            if prev == 'Journal' and 'IssnType' in attributes:
                self.IssnType = attributes['IssnType']
        elif tag == 'JournalIssue':
            # Matches only the sequence Journal, <one element>, JournalIssue.
            if self.Tags[-3] == 'Journal' and 'CitedMedium' in attributes:
                self.CitedMedium = attributes['CitedMedium']
        elif tag == 'AbstractText' and 'Label' in attributes:
            label = attributes['Label']
            if prev == 'Abstract':
                # First section: the label starts the abstract.
                self.AbstractText = label + ":"
            elif prev == 'AbstractText':
                # Later sections are separated from the previous one by " ## ".
                self.AbstractText = self.AbstractText + " ## " + label + ":"

    def endElement(self, tag):
        """Dispatch pending text and emit a row when a PubmedArticle closes."""
        global i
        self._flush_text()
        self.CurrentData = tag
        if tag != 'PubmedArticle':
            return
        self.Pubmed.append([
            self.PMID, self.DateCompleted, self.DateRevised, self.ISSN,
            self.IssnType, self.CitedMedium, self.date, self.Date_year,
            self.ArticleType, self.ISOAbbreviation, self.ArticleTitle,
            self.Language, self.ELocationID, self.Author, self.Author_full,
            self.Affiliation, self.Keywords, self.AbstractText,
        ])
        i = i + 1
        self.init()
        # Progress report, only evaluated when a record has just completed.
        if i % 4000 == 0:
            print("第 %d 条数据" % i)

    def characters(self, content):
        """Buffer a chunk of character data.

        A SAX parser may deliver the text of one element in several calls
        (for example around entities such as ``&amp;``), so the chunks are
        collected here and interpreted as a whole in ``_flush_text``.
        """
        self._text.append(content)

    def _flush_text(self):
        """Join the buffered chunks and dispatch them as one text run."""
        if not self._text:
            return
        content = ''.join(self._text)
        self._text = []
        # Whitespace-only runs (indentation between elements) carry no data.
        if content.strip() == '':
            return
        self._handle_text(content)

    def _handle_text(self, content):
        """Store *content* in the field selected by the recent tag history."""
        tags = self.Tags
        cur, prev = tags[-1], tags[-2]

        if cur == 'PMID':
            if prev == 'MedlineCitation':
                self.PMID = content
        elif cur == 'ISSN':
            if prev == 'Journal':
                self.ISSN = content
        elif cur == 'Year':
            if prev == 'PubDate':
                self.Date_year = content
                self.date = self.Date_year
            elif prev == 'DateCompleted':
                self.DateCompleted = content
            elif prev == 'DateRevised':
                self.DateRevised = content
        elif cur == 'Month':
            # Expected history: <date element>, Year, Month.
            opener = tags[-3]
            if opener == 'PubDate':
                self.Date_month = content
                self.date = self.Date_month + '/' + self.Date_year
            elif opener == 'DateCompleted':
                self.DateCompleted_Month = content
                self.DateCompleted = self.DateCompleted_Month + '/' + self.DateCompleted
            elif opener == 'DateRevised':
                self.DateRevised_Month = content
                self.DateRevised = self.DateRevised_Month + "/" + self.DateRevised
        elif cur == 'Day':
            # Expected history: <date element>, Year, Month, Day.
            opener = tags[-4]
            if opener == 'PubDate':
                self.Date_day = content
                self.date = self.Date_day + '/' + self.Date_month + '/' + self.Date_year
            elif opener == 'DateCompleted':
                self.DateCompleted = content + '/' + self.DateCompleted
            elif opener == 'DateRevised':
                self.DateRevised = content + '/' + self.DateRevised
        elif cur == 'Title':
            # The text of <Title> is stored under the ArticleType field.
            self.ArticleType = content
        elif cur == 'ISOAbbreviation':
            self.ISOAbbreviation = content
        elif cur == 'ArticleTitle':
            self.ArticleTitle = content
        elif cur == 'ELocationID':
            self.ELocationID = content
        elif cur == 'AbstractText':
            if prev in ('Abstract', 'AbstractText'):
                self.AbstractText = self.AbstractText + content
        elif cur == 'Keyword':
            if self.Keywords:
                self.Keywords = self.Keywords + ";" + content
            else:
                self.Keywords = content
        elif cur == 'LastName':
            self.LastName = content
        elif cur == 'ForeName':
            self.ForeName = content
        elif cur == 'Initials':
            # <Initials> is treated as the end of an author's name parts:
            # the author is added to both name lists at this point.
            self.Initials = content
            short_name = self.LastName + ' ' + self.Initials
            full_name = self.LastName + ' ' + self.ForeName
            if self.Author:
                self.Author = self.Author + "; " + short_name
                self.Author_full = self.Author_full + "; " + full_name
            else:
                self.Author = short_name
                self.Author_full = full_name
        elif cur == 'Affiliation':
            # Prefixed with the most recently seen author name.
            entry = self.LastName + " " + self.ForeName + ":" + content
            if self.Affiliation:
                self.Affiliation = self.Affiliation + ";" + entry
            else:
                self.Affiliation = entry
        elif cur == 'Language':
            self.Language = content
def xml_parser(file_loca, save_path):
    """Parse one PubMed XML file with SAX and write its records to a CSV file.

    Args:
        file_loca: Path of the XML file to parse.
        save_path: Path of the CSV file to write (UTF-8, no index column).
    """
    parser = xml.sax.make_parser()
    # Namespace processing off: element names are reported as written.
    parser.setFeature(xml.sax.handler.feature_namespaces, 0)
    handler = SaxHandler()
    parser.setContentHandler(handler)
    parser.parse(file_loca)

    data = handler.Pubmed
    print(len(data))
    # A file without any PubmedArticle yields no rows; guard the width
    # report so such a file produces a header-only CSV instead of an error.
    if data:
        print(len(data[0]))
    columns = ["PMID", "DateCompleted", "DateRevised", "ISSN", "IssnType",
               "CitedMedium", "date", "year", "Article_Type",
               "ISOAbbreviation", "ArticleTitle", "Language", "ELocationID",
               "Author", "Author_full", "Affiliation", "Keywords",
               "AbstractText"]
    data2 = pd.DataFrame(data, columns=columns)
    data2.to_csv(save_path, index=False, encoding='utf-8')