# python lxml 获取企业信息 (fetching enterprise information with Python and lxml)

#coding=gbk


import numpy as np
import pandas as pd

from lxml import etree
import csv
import requests

# Column letters and a starting row number; neither is read by the code
# visible in this file (presumably left over from a spreadsheet export).
col = "ABCDEFGHIJKLM"
row = 1

# Module-level accumulators that getData() extends on every call.  Each name
# is bound to its own empty float64 array; np.append() later rebinds the name
# to a new, longer array rather than mutating these in place.

# Per-enterprise header values: cells 1, 3, 4, 5, 6 of the centre-aligned
# <td> list plus the legal-representative cell.
(nameArray1, nameArray3, nameArray4,
 nameArray5, nameArray6, nameArray7) = (np.empty(0) for _ in range(6))

# Per-row values taken from the repeating ten-cell groups of the page.
(nameArray8, nameArray10, nameArray11, nameArray12, nameArray13,
 nameArray14, nameArray15, nameArray16, nameArray17) = (
    np.empty(0) for _ in range(9)
)

# The two cells read at fixed offsets after the '安全许可信息' marker cell.
nameArrayEnd1, nameArrayEnd3 = (np.empty(0) for _ in range(2))


def _cell_text(cell):
    """Return the element's own text with spaces removed, or '' if it has none."""
    text = cell.text
    return '' if text is None else text.replace(' ', '')


def getData(idUrl):
    """Scrape one enterprise detail page and rewrite the CSV export.

    Downloads ``idUrl``, reads its centre-aligned ``<td>`` cells, appends the
    extracted rows to the module-level ``nameArray*`` accumulators and then
    rewrites ``test2.csv`` from everything accumulated so far (so the file
    always holds the rows of every page processed in this run).

    The page is read positionally: cells 1, 3, 4, 5, 6 plus the 15th ``<td>``
    of the whole document form the enterprise header, and the cells from
    index 8 onwards are consumed in groups of ten, one group per output row.
    Header values are written only on the first row of a page; following rows
    carry empty strings in those columns.

    Args:
        idUrl: URL of the enterprise detail page.

    Raises:
        IndexError: if the '安全许可信息' marker cell is missing or the page
            has fewer cells than the fixed layout requires.  The accumulators
            are left untouched in that case.
    """
    global nameArray1, nameArray3, nameArray4, nameArray5, nameArray6, nameArray7
    global nameArray8, nameArray10, nameArray11, nameArray12, nameArray13
    global nameArray14, nameArray15, nameArray16, nameArray17
    global nameArrayEnd1, nameArrayEnd3

    # A timeout keeps one unresponsive page from hanging the whole crawl.
    ht = requests.get(url=idUrl, timeout=30)
    html = etree.HTML(ht.text)

    # The legal representative is the 15th <td> of the page, whatever its
    # alignment; every other value comes from the centre-aligned cells only.
    faren = _cell_text(html.xpath('//td')[14])
    res = html.xpath('//td[@align = "center"]')

    # Locate the marker cell.  `end` is the marker's index + 1 because `i` is
    # advanced before the comparison; the offsets below depend on that.
    i = 0
    end = None
    for a in res:
        print(i)
        i += 1

        if a.text is not None:
            astr = a.text.replace(' ', '')
            print(astr)

            if astr == '安全许可信息':
                end = i
        if end is not None and i > end + 3:
            break

    if end is None:
        raise IndexError("marker cell '安全许可信息' not found in " + str(idUrl))
    print("eeeeeeeeeeeeeeeeeeee" + str(end))

    # Collect the page's rows locally first, so that a short or malformed page
    # raises before any accumulator is touched and they never go out of step.
    rows = []
    for m in range(end // 10):
        if m == 0:
            head = [_cell_text(res[k]) for k in (1, 3, 4, 5, 6)] + [faren]
            tail = [_cell_text(res[end + 1]), _cell_text(res[end + 3])]
        else:
            head = [''] * 6
            tail = [''] * 2

        # Each row occupies ten cells starting after index 7; the second cell
        # of every group is skipped.
        base = 7 + 10 * m
        body = [_cell_text(res[base + k]) for k in (1, 3, 4, 5, 6, 7, 8, 9, 10)]
        rows.append(head + body + tail)

    if rows:
        (nameArray1, nameArray3, nameArray4, nameArray5, nameArray6, nameArray7,
         nameArray8, nameArray10, nameArray11, nameArray12, nameArray13,
         nameArray14, nameArray15, nameArray16, nameArray17,
         nameArrayEnd1, nameArrayEnd3) = [
            np.append(existing, added)
            for existing, added in zip(
                (nameArray1, nameArray3, nameArray4, nameArray5, nameArray6,
                 nameArray7, nameArray8, nameArray10, nameArray11, nameArray12,
                 nameArray13, nameArray14, nameArray15, nameArray16,
                 nameArray17, nameArrayEnd1, nameArrayEnd3),
                zip(*rows),
            )
        ]

    writefile = "test2.csv"
    data = [nameArray1, nameArray3, nameArray4, nameArray5, nameArray6, nameArray7,
            nameArray8, nameArray10, nameArray11, nameArray12, nameArray13, nameArray14,
            nameArray15, nameArray16, nameArray17, nameArrayEnd1, nameArrayEnd3]
    data = np.transpose(data)
    # Keep the named columns: previously the frame was rebuilt without them
    # right after this line, so the CSV header was just 0..16.
    ser2 = pd.DataFrame(data, columns=['企业名称', '营业证', '地址', '注册资本', '组织机构代码号', '法人',
                                       '序号', '编号', '日期', '状态', '资质序列', '类别', '等级', '发证单位', '核准日期', '证书', '有效期'])
    print("write file")
    ser2.to_csv(writefile, encoding="utf_8_sig")


import sys

def run():
    """Crawl one page of the enterprise list and scrape every entry on it.

    The page number is taken from the first command-line argument; numbers
    above 3 are rejected with a message.  For each link found on the list
    page, the enterprise id is cut out of its ``onclick`` attribute and the
    matching detail page is handed to ``getData``.

    Raises:
        IndexError: if no command-line argument was given.
        ValueError: if the argument is not an integer.
    """
    yema = sys.argv[1]
    if int(yema) > 3:
        print("> 3")
        return

    url = "http://124.115.170.171:7001/PDR/network/informationSearch/informationSearchList?&pageNumber=" + str(yema)

    # A timeout keeps an unresponsive server from hanging the script forever.
    res = requests.get(url=url, timeout=30)
    html = etree.HTML(res.text)

    for onclick in html.xpath('//td/p/a/@onclick'):
        # The id is the second comma-separated argument of the onclick call.
        # The +2 / -2 offsets trim the characters around it; they are tied to
        # the exact markup of the list page — TODO confirm against a live page.
        s = onclick.find(',')
        aid = onclick[s+2:]
        print(onclick)
        e = aid.find(',')
        aid = aid[:e-2]
        print(aid)

        idUrl = 'http://124.115.170.171:7001/PDR/network/Enterprise/Informations/view?enid=' + aid

        getData(idUrl)
        
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    run()

    


 

# 你可能感兴趣的:(python,python,xpath)