Python解析xml文档实战案例

xml文档





    
        28901317
        
            2018
            05
            10
        
        
            2018
            12
            02
        
        
1998-4138 13 4 2017 Journal of cancer research and therapeutics J Cancer Res Ther k-RAS mutation and resistance to epidermal growth factor receptor-tyrosine kinase inhibitor treatment in patients with nonsmall cell lung cancer. 699-701 10.4103/jcrt.JCRT_468_17 The aim of this study was to evaluate the relationship between k-RAS gene mutation and the resistance to epidermal growth factor receptor-tyrosine kinase inhibitor (EGFR-TKI) treatment in patients with nonsmall-cell lung cancer (NSCLC). Forty-five pathologies confirmed NSCLC patients who received EGFR-TKI (Gefitinib) treatment were retrospectively included in this study. The mutation of codon 12 and 13, located in exon1 and exon 2 of k-RAS gene were examined by polymerase chain reaction (PCR) and DAN sequencing in tumor samples of the included 45 NSCLC patients. The correlation between Gefitinib treatment response and k-RAS mutation status was analyzed in tumor samples of the 45 NSCLC patients. Eight tumor samples of the 45 NSCLC patients were found to be mutated in coden 12 or 13, with an mutation rate of 17.8% (8/45); the objective response rate (ORR) was 29.7%(11/37) with 1 cases of complete response (CR) and 10 cases of partial response in k-RAS mutation negative patients. Furthermore, the ORR was 0.0% in k-RAS mutation positive patients with none CR. The ORR between k-RAS mutation and nonmutation patients were significant different (P < 0.05). k-RAS gene mutation status was associated with the response of Gefitinib treatment in patients with NSCLC. Zhou Bin B Department of Pharmacy, The First Affiliated Hospital of Wenzhou Medical University, Wenzhou, Zhejiang, Province 325200, PR China. Tang Congrong C Department of Pharmacy, The First Affiliated Hospital of Wenzhou Medical University, Wenzhou, Zhejiang, Province 325200, PR China. Li Jie J Department of Pharmacy, Ruian People's Hospital, Ruian, Zhejiang, Province 325200, PR China. eng Journal Article
India J Cancer Res Ther 101249598 1998-4138 0 KRAS protein, human 0 Protein Kinase Inhibitors 0 Quinazolines EC 2.7.10.1 EGFR protein, human EC 2.7.10.1 ErbB Receptors EC 3.6.5.2 Proto-Oncogene Proteins p21(ras) S65743JHBS Gefitinib IM Adult Aged Carcinoma, Non-Small-Cell Lung drug therapy genetics pathology Drug Resistance, Neoplasm ErbB Receptors antagonists & inhibitors Female Gefitinib Humans Male Middle Aged Mutation Protein Kinase Inhibitors administration & dosage Proto-Oncogene Proteins p21(ras) genetics Quinazolines administration & dosage
2017 9 14 6 0 2017 9 14 6 0 2018 5 11 6 0 ppublish 28901317 JCanResTher_2017_13_4_699_214476 10.4103/jcrt.JCRT_468_17

  方法一:xml.etree.cElementTree

# -*- coding: utf-8 -*-

"""
@Datetime: 2019/4/25
@Author: Zhang Yafei
"""
import os
import re
import threading
import xml.etree.cElementTree as ET
from concurrent.futures import ThreadPoolExecutor
from itertools import chain

import pandas as pd


# Module-level lock shared by all worker threads.  The original code did
# ``with threading.Lock():`` which builds a brand-new lock on every call and
# therefore never blocks anyone; a single shared lock actually serializes
# the concurrent CSV appends.
_CSV_LOCK = threading.Lock()


def pubmed_xml_parser(path):
    """Parse one PubMed XML file and append one CSV row per unique article.

    Extracted per article: PMID, journal ISO abbreviation, the union of all
    author affiliations, publication year, MeSH terms, country, publication
    type(s), and the source file path.  Rows are appended without a header
    to ``<dir_name>.csv``, where ``dir_name`` is the part of ``path`` before
    the first backslash (the script assumes a Windows-style layout of
    ``<drug class>\\<file>.xml`` — TODO confirm on other platforms).

    Args:
        path: Path to a PubMed ``PubmedArticleSet`` XML file.
    """
    dir_name = path.split('\\')[0]
    print(dir_name)
    root = ET.parse(path).getroot()
    data_list = []
    seen_pmids = set()  # set gives O(1) dedup; the original list was O(n) per check
    for article_node in root.iter('PubmedArticle'):
        citation = article_node.find('MedlineCitation')
        pmid = citation.find('PMID').text
        if pmid in seen_pmids:  # skip duplicate records within the same file
            continue
        seen_pmids.add(pmid)
        article = citation.find('Article')
        journal = article.find('Journal').find('ISOAbbreviation').text
        # Affiliations: union over every author; a missing AuthorList makes
        # find() return None and the attribute access raise AttributeError.
        try:
            affiliations_info = set()
            for author in article.find('AuthorList').findall('Author'):
                for info in author.findall('AffiliationInfo'):
                    affiliations_info.add(info.find('Affiliation').text)
            affiliations_info = ';'.join(affiliations_info)
        except AttributeError:
            affiliations_info = ''
        # Publication year: prefer <Year>; otherwise take the first run of
        # digits from <MedlineDate> (e.g. "2000 Spring" -> "2000").
        pub_date = article.find('Journal').find('JournalIssue').find('PubDate')
        try:
            date = pub_date.find('Year').text
        except AttributeError:
            date = re.search(r'\d+', pub_date.find('MedlineDate').text).group(0)
        # MeSH terms: a lone child is taken verbatim; otherwise emit one
        # "Descriptor/Qualifier" entry per qualifier.
        try:
            mesh_words = []
            for mesh_heading in citation.find('MeshHeadingList').findall('MeshHeading'):
                children = list(mesh_heading)
                if len(children) == 1:
                    mesh_words.append(children[0].text)
                    continue
                mesh_name = ''
                for mesh in children:
                    if mesh.tag == 'DescriptorName':
                        mesh_name = mesh.text
                    elif mesh_name and mesh.tag == 'QualifierName':
                        mesh_words.append(mesh_name + '/' + mesh.text)
            mesh_words = ';'.join(mesh_words)
        except AttributeError:
            print(citation.find('PMID').text)
            mesh_words = ''
        # list(elem) replaces Element.getchildren(), which was removed in
        # Python 3.9.
        article_type = '/'.join(x.text for x in list(article.find('PublicationTypeList')))
        country = citation.find('MedlineJournalInfo').find('Country').text
        data_list.append(
            {'PMID': pmid, 'journal': journal, 'affiliations_info': affiliations_info,
             'pub_year': date, 'mesh_words': mesh_words, 'country': country,
             'article_type': article_type, 'file_path': path})
        print(pmid + '\t解析完成')
    df = pd.DataFrame(data_list)
    with _CSV_LOCK:  # serialize appends coming from the thread pool
        df.to_csv('{}.csv'.format(dir_name), encoding='utf_8_sig', mode='a',
                  index=False, header=False)


def to_excel(data, path):
    """Write *data* (a pandas DataFrame) to *path* as a one-sheet Excel file.

    Uses ``ExcelWriter`` as a context manager so the file is always flushed
    and closed, even if ``to_excel`` raises; the explicit ``writer.save()``
    call was deprecated and removed in pandas 2.0.
    """
    with pd.ExcelWriter(path) as writer:
        data.to_excel(writer, sheet_name='table', index=False)


def get_files_path():
    """Collect the .xml files under the three drug folders that still need parsing.

    Walks 'first in class drug', 'follow on drug' and 'me too drug',
    gathers every ``.xml`` path, then subtracts the paths already recorded
    in the matching ``<folder>.csv`` result files.  On the first run (no
    CSV exists) the three CSVs are created with only a header row so that
    later ``header=False`` appends line up with the columns.

    Returns:
        The paths still to parse — a set when resuming, a list on the
        first run (callers only iterate / truth-test it).
    """
    folders = ('first in class drug', 'follow on drug', 'me too drug')
    # Original bug fixed: ``file_list`` was *reassigned* on every walk
    # iteration of the first folder (only the last directory level
    # survived) and was undefined if that folder did not exist.
    file_list = []
    for folder in folders:
        for base_path, _dirs, files in os.walk(folder):
            file_list.extend(os.path.join(base_path, f)
                             for f in files if f.endswith('.xml'))
    csv_names = [folder + '.csv' for folder in folders]
    if any(os.path.exists(name) for name in csv_names):
        # Original bug fixed: ``has_files_list`` was only initialized when
        # the *first* CSV existed, so a later CSV alone raised NameError.
        has_files_list = []
        for name in csv_names:
            if os.path.exists(name):
                df = pd.read_csv(name, encoding='utf-8')
                has_files_list.extend(df.file_path.tolist())
        print('共需解析文件:{0}'.format(len(file_list)))
        has_files_list = set(has_files_list)
        file_list = set(file_list) - has_files_list
        print('已解析文件:{0}'.format(len(has_files_list)))
    else:
        df = pd.DataFrame(
            columns=['PMID', 'affiliations_info', 'article_type', 'country',
                     'file_path', 'journal', 'mesh_words', 'pub_year'])
        for name in csv_names:
            df.to_csv(name, encoding='utf_8_sig', index=False)
        print('共需解析文件:{0}'.format(len(file_list)))
        print('已解析文件:0')
    return file_list


if __name__ == '__main__':
    pending = get_files_path()
    if pending:
        # Fan the per-file parsing jobs out across a thread pool sized to
        # the machine's CPU count; the context manager waits for every job
        # to finish before exiting.
        with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
            pool.map(pubmed_xml_parser, pending)
    else:
        print('全部解析完成')

  方法二:lxml+xpath

# -*- coding: utf-8 -*-

"""
@Datetime: 2019/4/26
@Author: Zhang Yafei
"""
import os
import re
import threading
from concurrent.futures import ThreadPoolExecutor

from lxml import etree
import pandas as pd


# Module-level lock shared by all worker threads.  The original
# ``with threading.Lock():`` created a fresh lock per call, which never
# actually blocks concurrent writers.
_CSV_LOCK = threading.Lock()


def pubmed_xpath_parse(path):
    """Parse one PubMed XML file via lxml/XPath and append rows to pubmed.csv.

    Extracted per article: PMID, journal ISO abbreviation, the union of all
    author affiliations, publication year, MeSH terms, country, publication
    type(s), and the source file path.  Rows are appended (no header) to
    ``pubmed.csv``.

    Args:
        path: Path to a PubMed ``PubmedArticleSet`` XML file.
    """
    tree = etree.parse(path)
    # If the XML declares a DTD, lxml must be told to load it, e.g.:
    # parser = etree.XMLParser(load_dtd=True)  # DTD file beside the XML file
    # tree = etree.parse('1.xml', parser=parser)
    data_list = []
    seen_pmids = set()  # O(1) dedup; the original list was O(n) per check
    for article_node in tree.xpath('//PubmedArticle'):
        pmid = article_node.xpath('MedlineCitation/PMID/text()')[0]
        if pmid in seen_pmids:  # skip duplicate records within the same file
            continue
        seen_pmids.add(pmid)
        article = article_node.xpath('MedlineCitation/Article')[0]
        journal = article.xpath('Journal/ISOAbbreviation/text()')[0]
        # Affiliations: union over every author.  IndexError is also caught
        # so an AffiliationInfo without an Affiliation child degrades to ''
        # instead of crashing the worker thread.
        try:
            affiliations_info = set()
            for author in article.xpath('AuthorList/Author'):
                for info in author.xpath('AffiliationInfo'):
                    affiliations_info.add(info.xpath('Affiliation/text()')[0])
            affiliations_info = ';'.join(affiliations_info)
        except (AttributeError, IndexError):
            affiliations_info = ''
        # Publication year: prefer <Year>; otherwise take the first run of
        # digits from <MedlineDate> (e.g. "2000 Spring" -> "2000").
        try:
            date = article.xpath('Journal/JournalIssue/PubDate/Year/text()')[0]
        except IndexError:
            medline_date = article.xpath('Journal/JournalIssue/PubDate/MedlineDate/text()')[0]
            date = re.search(r'\d+', medline_date).group(0)
        # MeSH terms: a lone child is taken verbatim; otherwise emit one
        # "Descriptor/Qualifier" entry per qualifier.
        try:
            mesh_words = []
            for mesh_heading in article_node.xpath('MedlineCitation/MeshHeadingList/MeshHeading'):
                children = mesh_heading.xpath('child::*')
                if len(children) == 1:
                    mesh_words.append(children[0].text)
                    continue
                mesh_name = ''
                for mesh in children:
                    if mesh.tag == 'DescriptorName':
                        mesh_name = mesh.xpath('string()')
                    elif mesh_name and mesh.tag == 'QualifierName':
                        mesh_words.append(mesh_name + '/' + mesh.xpath('string()'))
            mesh_words = ';'.join(mesh_words)
        except AttributeError:
            mesh_words = ''
        article_type = '/'.join(
            [x.xpath('./text()')[0] for x in article.xpath('PublicationTypeList/PublicationType')])
        country = article_node.xpath('MedlineCitation/MedlineJournalInfo/Country/text()')[0]
        data_list.append(
            {'PMID': pmid, 'journal': journal, 'affiliations_info': affiliations_info,
             'pub_year': date, 'mesh_words': mesh_words, 'country': country,
             'article_type': article_type, 'file_path': path})
        print(pmid + '\t解析完成')
    # BUG FIX: the DataFrame build and CSV append used to sit INSIDE the
    # article loop, so after article N the file contained rows 1..N again —
    # every article except the last was written repeatedly.  Write exactly
    # once per file, as the ElementTree version does.
    df = pd.DataFrame(data_list)
    with _CSV_LOCK:  # serialize appends coming from the thread pool
        df.to_csv('pubmed.csv', encoding='utf_8_sig', mode='a', index=False, header=False)


def to_excel(data, path):
    """Write *data* (a pandas DataFrame) to *path* as a one-sheet Excel file.

    Uses ``ExcelWriter`` as a context manager so the file is always flushed
    and closed, even if ``to_excel`` raises; the explicit ``writer.save()``
    call was deprecated and removed in pandas 2.0.
    """
    with pd.ExcelWriter(path) as writer:
        data.to_excel(writer, sheet_name='table', index=False)


def get_files_path():
    """Collect the .xml files under the three drug folders that still need parsing.

    Walks 'first in class drug', 'follow on drug' and 'me too drug',
    gathers every ``.xml`` path, then subtracts the paths already recorded
    in ``pubmed.csv``.  On the first run (no CSV) the file is created with
    only a header row so later ``header=False`` appends line up.

    Returns:
        The paths still to parse — a set when resuming, a list on the
        first run (callers only iterate / truth-test it).
    """
    # Original bug fixed: ``file_list`` was *reassigned* on every walk
    # iteration of the first folder (only the last directory level
    # survived) and was undefined if that folder did not exist.
    file_list = []
    for folder in ('first in class drug', 'follow on drug', 'me too drug'):
        for base_path, _dirs, files in os.walk(folder):
            file_list.extend(os.path.join(base_path, f)
                             for f in files if f.endswith('.xml'))
    if os.path.exists('pubmed.csv'):
        df = pd.read_csv('pubmed.csv', encoding='utf-8')
        parsed = set(df.file_path)
        print('共需解析文件:{0}'.format(len(file_list)))
        file_list = set(file_list) - parsed
        print('已解析文件:{0}'.format(len(parsed)))
    else:
        df = pd.DataFrame(
            columns=['PMID', 'affiliations_info', 'article_type', 'country',
                     'file_path', 'journal', 'mesh_words', 'pub_year'])
        df.to_csv('pubmed.csv', encoding='utf_8_sig', index=False)
        print('共需解析文件:{0}'.format(len(file_list)))
        print('已解析文件:0')
    return file_list


if __name__ == '__main__':
    files_list = get_files_path()
    if not files_list:
        print('全部解析完成')
    else:
        # Use the pool as a context manager (matching the ElementTree
        # script) so it is shut down deterministically and every submitted
        # job finishes before the interpreter exits; the original never
        # called shutdown().
        with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
            pool.map(pubmed_xpath_parse, files_list)

  

 

 

 

 

  

转载于:https://www.cnblogs.com/zhangyafei/p/10776698.html

你可能感兴趣的:(Python解析xml文档实战案例)