xml文档
28901317
2018
05
10
2018
12
02
1998-4138
13
4
2017
Journal of cancer research and therapeutics
J Cancer Res Ther
k-RAS mutation and resistance to epidermal growth factor receptor-tyrosine kinase inhibitor treatment in patients with nonsmall cell lung cancer.
699-701
10.4103/jcrt.JCRT_468_17
The aim of this study was to evaluate the relationship between k-RAS gene mutation and the resistance to epidermal growth factor receptor-tyrosine kinase inhibitor (EGFR-TKI) treatment in patients with nonsmall-cell lung cancer (NSCLC).
Forty-five pathologies confirmed NSCLC patients who received EGFR-TKI (Gefitinib) treatment were retrospectively included in this study. The mutation of codon 12 and 13, located in exon1 and exon 2 of k-RAS gene were examined by polymerase chain reaction (PCR) and DAN sequencing in tumor samples of the included 45 NSCLC patients. The correlation between Gefitinib treatment response and k-RAS mutation status was analyzed in tumor samples of the 45 NSCLC patients.
Eight tumor samples of the 45 NSCLC patients were found to be mutated in coden 12 or 13, with an mutation rate of 17.8% (8/45); the objective response rate (ORR) was 29.7%(11/37) with 1 cases of complete response (CR) and 10 cases of partial response in k-RAS mutation negative patients. Furthermore, the ORR was 0.0% in k-RAS mutation positive patients with none CR. The ORR between k-RAS mutation and nonmutation patients were significant different (P < 0.05).
k-RAS gene mutation status was associated with the response of Gefitinib treatment in patients with NSCLC.
Zhou
Bin
B
Department of Pharmacy, The First Affiliated Hospital of Wenzhou Medical University, Wenzhou, Zhejiang, Province 325200, PR China.
Tang
Congrong
C
Department of Pharmacy, The First Affiliated Hospital of Wenzhou Medical University, Wenzhou, Zhejiang, Province 325200, PR China.
Li
Jie
J
Department of Pharmacy, Ruian People's Hospital, Ruian, Zhejiang, Province 325200, PR China.
eng
Journal Article
India
J Cancer Res Ther
101249598
1998-4138
0
KRAS protein, human
0
Protein Kinase Inhibitors
0
Quinazolines
EC 2.7.10.1
EGFR protein, human
EC 2.7.10.1
ErbB Receptors
EC 3.6.5.2
Proto-Oncogene Proteins p21(ras)
S65743JHBS
Gefitinib
IM
Adult
Aged
Carcinoma, Non-Small-Cell Lung
drug therapy
genetics
pathology
Drug Resistance, Neoplasm
ErbB Receptors
antagonists & inhibitors
Female
Gefitinib
Humans
Male
Middle Aged
Mutation
Protein Kinase Inhibitors
administration & dosage
Proto-Oncogene Proteins p21(ras)
genetics
Quinazolines
administration & dosage
2017
9
14
6
0
2017
9
14
6
0
2018
5
11
6
0
ppublish
28901317
JCanResTher_2017_13_4_699_214476
10.4103/jcrt.JCRT_468_17
方法一:xml.etree.cElementTree
# -*- coding: utf-8 -*-
"""
@Datetime: 2019/4/25
@Author: Zhang Yafei
"""
import os
import re
import threading
import xml.etree.cElementTree as ET
from concurrent.futures import ThreadPoolExecutor
from itertools import chain
import pandas as pd
# One lock shared by every worker thread. A lock created inside the function
# would be private to each call and therefore would not serialize anything.
_CSV_WRITE_LOCK = threading.Lock()

# Rows are appended without a header, so their column order must match the
# header row that get_files_path() writes when it creates the CSV files.
_CSV_COLUMNS = ['PMID', 'affiliations_info', 'article_type', 'country',
                'file_path', 'journal', 'mesh_words', 'pub_year']


def pubmed_xml_parser(path):
    """Parse one PubMed XML export and append its articles to a CSV file.

    One row is produced per distinct PMID found under ``PubmedArticle``
    elements. The rows are appended (no header) to ``<first component of
    path>.csv`` in the current working directory.

    Optional elements that are absent (authors, affiliations, publication
    date, MeSH headings, publication types, journal abbreviation, country)
    yield an empty string instead of aborting the whole file.

    :param path: path of the XML file; its first path component names the
        output CSV.
    """
    # Accept both separators so the folder name is found on any platform.
    dir_name = re.split(r'[\\/]', path)[0]
    print(dir_name)
    root = ET.parse(path).getroot()
    data_list = []
    seen_pmids = set()
    for articles in root.iter('PubmedArticle'):
        citation = articles.find('MedlineCitation')
        pmid = citation.find('PMID').text
        if pmid in seen_pmids:
            continue
        seen_pmids.add(pmid)
        Article = citation.find('Article')
        journal = Article.findtext('Journal/ISOAbbreviation', default='')

        # Distinct affiliations in first-seen order (a dict keeps insertion
        # order, which makes the output reproducible between runs).
        affiliations = {}
        for info in Article.findall('AuthorList/Author/AffiliationInfo'):
            affiliation = info.findtext('Affiliation')
            if affiliation:
                affiliations[affiliation] = None
        affiliations_info = ';'.join(affiliations)

        # Prefer <Year>; otherwise take the first run of digits found in
        # <MedlineDate>.
        date = ''
        pub_date = Article.find('Journal/JournalIssue/PubDate')
        if pub_date is not None:
            date = pub_date.findtext('Year')
            if date is None:
                match = re.search(r'\d+', pub_date.findtext('MedlineDate', default=''))
                date = match.group(0) if match else ''

        mesh_terms = []
        mesh_list = citation.find('MeshHeadingList')
        if mesh_list is None:
            # Report records that carry no MeSH headings at all.
            print(pmid)
        else:
            for mesh_heading in mesh_list.findall('MeshHeading'):
                children = list(mesh_heading)
                if len(children) == 1:
                    # A heading with a single child is recorded as that
                    # child's text alone.
                    mesh_terms.append(children[0].text or '')
                    continue
                mesh_name = ''
                for mesh in children:
                    if mesh.tag == 'DescriptorName':
                        mesh_name = mesh.text or ''
                    elif mesh_name and mesh.tag == 'QualifierName':
                        mesh_terms.append(mesh_name + '/' + (mesh.text or ''))
        mesh_words = ';'.join(mesh_terms)

        # Element.getchildren() no longer exists on Python 3.9+, so the
        # children are selected with findall() instead.
        article_type = '/'.join(
            node.text or '' for node in Article.findall('PublicationTypeList/*'))
        country = citation.findtext('MedlineJournalInfo/Country', default='')
        data_list.append(
            {'PMID': pmid, 'journal': journal, 'affiliations_info': affiliations_info, 'pub_year': date,
             'mesh_words': mesh_words,
             'country': country, 'article_type': article_type, 'file_path': path})
        print(pmid + '\t解析完成')
    df = pd.DataFrame(data_list, columns=_CSV_COLUMNS)
    with _CSV_WRITE_LOCK:
        df.to_csv('{}.csv'.format(dir_name), encoding='utf_8_sig', mode='a', index=False, header=False)
def to_excel(data, path):
    """Write ``data`` to the Excel file ``path`` on a sheet named 'table'.

    The writer is used as a context manager so the workbook is saved and
    closed even if writing fails; ``ExcelWriter.save()`` is not available
    in current pandas releases.

    :param data: DataFrame to export (the index is not written).
    :param path: destination file path.
    """
    with pd.ExcelWriter(path) as writer:
        data.to_excel(writer, sheet_name='table', index=False)
def get_files_path():
    """Collect the XML files that still have to be parsed.

    Every ``.xml`` file below the three drug folders is gathered. Files whose
    path already appears in the ``file_path`` column of one of the per-folder
    CSV files are treated as parsed and removed. Any per-folder CSV that does
    not exist yet is created with a header row only.

    :return: a list of paths when no CSV existed beforehand, otherwise a set
        of the paths that are not yet recorded in the CSV files.
    """
    source_dirs = ('first in class drug', 'follow on drug', 'me too drug')
    csv_columns = ['PMID', 'affiliations_info', 'article_type', 'country', 'file_path', 'journal',
                   'mesh_words', 'pub_year']

    # Accumulate across every directory visited by os.walk; assigning inside
    # the loop would keep only the files of the last directory walked.
    file_list = []
    for source_dir in source_dirs:
        for base_path, _folders, files in os.walk(source_dir):
            file_list.extend(os.path.join(base_path, file) for file in files if file.endswith('.xml'))

    csv_paths = ['{}.csv'.format(source_dir) for source_dir in source_dirs]
    resuming = any(os.path.exists(csv_path) for csv_path in csv_paths)

    has_files = set()
    for csv_path in csv_paths:
        if os.path.exists(csv_path):
            # The files are written as utf_8_sig, so read them the same way
            # to keep the BOM out of the first column name.
            df = pd.read_csv(csv_path, encoding='utf_8_sig')
            has_files.update(df['file_path'].dropna().tolist())
        else:
            # Rows are appended without a header later on, so each CSV needs
            # its header row before the first append.
            pd.DataFrame(columns=csv_columns).to_csv(csv_path, encoding='utf_8_sig', index=False)

    print('共需解析文件:{0}'.format(len(file_list)))
    if resuming:
        file_list = set(file_list) - has_files
        print('已解析文件:{0}'.format(len(has_files)))
    else:
        print('已解析文件:0')
    return file_list
if __name__ == '__main__':
    # Parse every pending XML file on a thread pool, one task per file.
    files_list = get_files_path()
    if not files_list:
        print('全部解析完成')
    else:
        with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
            futures = {pool.submit(pubmed_xml_parser, file_path): file_path for file_path in files_list}
        # Leaving the with-block waits for all tasks. Inspect each future so
        # that a failure inside a worker is reported instead of being lost.
        for future, file_path in futures.items():
            error = future.exception()
            if error is not None:
                print('{}\t解析失败: {!r}'.format(file_path, error))
方法二:lxml+xpath
# -*- coding: utf-8 -*-
"""
@Datetime: 2019/4/26
@Author: Zhang Yafei
"""
import os
import re
import threading
from concurrent.futures import ThreadPoolExecutor
from lxml import etree
import pandas as pd
# One lock shared by every worker thread. A lock created inside the function
# would be private to each call and therefore would not serialize anything.
_PUBMED_CSV_LOCK = threading.Lock()

# Rows are appended without a header, so their column order must match the
# header row that get_files_path() writes when it creates pubmed.csv.
_PUBMED_CSV_COLUMNS = ['PMID', 'affiliations_info', 'article_type', 'country',
                       'file_path', 'journal', 'mesh_words', 'pub_year']


def pubmed_xpath_parse(path):
    """Parse one PubMed XML export with lxml/XPath and append it to pubmed.csv.

    One row is produced per distinct PMID found under ``PubmedArticle``
    elements; rows are appended (no header) to ``pubmed.csv`` in the current
    working directory. Optional elements that are absent yield an empty
    string instead of aborting the whole file.

    :param path: path of the XML file to parse.
    """

    def first_text(node, expression):
        """Return the first XPath result as a plain str, or '' if none."""
        values = node.xpath(expression)
        return str(values[0]) if values else ''

    tree = etree.parse(path)
    # If the XML data carries a DTD declaration, lxml must be told about it
    # when parsing, for example:
    # parser = etree.XMLParser(load_dtd=True)  # build a parser from the DTD (the DTD file must sit next to the XML file)
    # tree = etree.parse('1.xml', parser=parser)  # parse the XML into a tree with that parser
    data_list = []
    seen_pmids = set()
    for articles in tree.xpath('//PubmedArticle'):
        pmid = articles.xpath('MedlineCitation/PMID/text()')[0]
        if pmid in seen_pmids:
            continue
        seen_pmids.add(pmid)
        Article = articles.xpath('MedlineCitation/Article')[0]
        journal = first_text(Article, 'Journal/ISOAbbreviation/text()')

        # Distinct affiliations in first-seen order. An empty XPath result is
        # skipped here; indexing it would raise IndexError.
        affiliations = {}
        for info in Article.xpath('AuthorList/Author/AffiliationInfo'):
            affiliation = first_text(info, 'Affiliation/text()')
            if affiliation:
                affiliations[affiliation] = None
        affiliations_info = ';'.join(affiliations)

        # Prefer <Year>; otherwise take the first run of digits found in
        # <MedlineDate>.
        date = first_text(Article, 'Journal/JournalIssue/PubDate/Year/text()')
        if not date:
            match = re.search(r'\d+', first_text(Article, 'Journal/JournalIssue/PubDate/MedlineDate/text()'))
            date = match.group(0) if match else ''

        mesh_terms = []
        for mesh_heading in articles.xpath('MedlineCitation/MeshHeadingList/MeshHeading'):
            children = mesh_heading.xpath('child::*')
            if len(children) == 1:
                # A heading with a single child is recorded as that child's
                # text alone.
                mesh_terms.append(children[0].text or '')
                continue
            mesh_name = ''
            for mesh in children:
                if mesh.tag == 'DescriptorName':
                    mesh_name = mesh.xpath('string()')
                elif mesh_name and mesh.tag == 'QualifierName':
                    mesh_terms.append(mesh_name + '/' + mesh.xpath('string()'))
        mesh_words = ';'.join(mesh_terms)

        article_type = '/'.join(
            first_text(node, './text()') for node in Article.xpath('PublicationTypeList/PublicationType'))
        country = first_text(articles, 'MedlineCitation/MedlineJournalInfo/Country/text()')
        data_list.append(
            {'PMID': pmid, 'journal': journal, 'affiliations_info': affiliations_info, 'pub_year': date,
             'mesh_words': mesh_words,
             'country': country, 'article_type': article_type, 'file_path': path})
        print(pmid + '\t解析完成')
    df = pd.DataFrame(data_list, columns=_PUBMED_CSV_COLUMNS)
    with _PUBMED_CSV_LOCK:
        df.to_csv('pubmed.csv', encoding='utf_8_sig', mode='a', index=False, header=False)
def to_excel(data, path):
    """Write ``data`` to the Excel file ``path`` on a sheet named 'table'.

    The writer is used as a context manager so the workbook is saved and
    closed even if writing fails; ``ExcelWriter.save()`` is not available
    in current pandas releases.

    :param data: DataFrame to export (the index is not written).
    :param path: destination file path.
    """
    with pd.ExcelWriter(path) as writer:
        data.to_excel(writer, sheet_name='table', index=False)
def get_files_path():
    """Collect the XML files that still have to be parsed.

    Every ``.xml`` file below the three drug folders is gathered. If
    ``pubmed.csv`` exists, paths already listed in its ``file_path`` column
    are removed; otherwise ``pubmed.csv`` is created with a header row only.

    :return: a list of paths when ``pubmed.csv`` did not exist, otherwise a
        set of the paths that are not yet recorded in it.
    """
    # Accumulate across every directory visited by os.walk; assigning inside
    # the loop would keep only the files of the last directory walked.
    file_list = []
    for source_dir in ('first in class drug', 'follow on drug', 'me too drug'):
        for base_path, _folders, files in os.walk(source_dir):
            file_list.extend(os.path.join(base_path, file) for file in files if file.endswith('.xml'))

    print('共需解析文件:{0}'.format(len(file_list)))
    if os.path.exists('pubmed.csv'):
        # The file is written as utf_8_sig, so read it the same way to keep
        # the BOM out of the first column name.
        df = pd.read_csv('pubmed.csv', encoding='utf_8_sig')
        has_files = set(df['file_path'].dropna())
        file_list = set(file_list) - has_files
        print('已解析文件:{0}'.format(len(has_files)))
    else:
        df = pd.DataFrame(
            columns=['PMID', 'affiliations_info', 'article_type', 'country', 'file_path', 'journal',
                     'mesh_words', 'pub_year'])
        df.to_csv('pubmed.csv', encoding='utf_8_sig', index=False)
        print('已解析文件:0')
    return file_list
if __name__ == '__main__':
    # Parse every pending XML file on a thread pool, one task per file.
    files_list = get_files_path()
    if not files_list:
        print('全部解析完成')
    else:
        # The with-block shuts the pool down and waits for all tasks.
        with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
            futures = {pool.submit(pubmed_xpath_parse, file_path): file_path for file_path in files_list}
        # Inspect each future so that a failure inside a worker is reported
        # instead of being lost.
        for future, file_path in futures.items():
            error = future.exception()
            if error is not None:
                print('{}\t解析失败: {!r}'.format(file_path, error))