# Python scraper, for personal use (自用)
from lxml import etree
import requests
import re
import time
import pandas as pd
from bs4 import BeautifulSoup as bs
def url_one(main_url):
    """Fetch the main page and pull fund names and links out of the
    发展动态 (development news) list.

    Parameters:
        main_url: URL of the site's main page.

    Returns:
        (code_name_list, href_list) — parallel lists of fund name prefixes
        (text before 基金) and the matching link hrefs.
    """
    resp = requests.get(main_url)
    # NOTE(review): the original regex literals were corrupted (HTML tags
    # stripped); reconstructed from the group names used below
    # ("ul", "code", "href") — confirm against the live page markup.
    obj1 = re.compile(r"发展动态.*?(?P<ul>.*?)</ul>", re.S)
    # Bug fix: original called .group() on the finditer() iterator
    # (AttributeError); a single match is wanted, so use search().
    result1 = obj1.search(resp.text)
    ul = result1.group("ul")
    obj2 = re.compile(r"\">(?P<code>.*?)基金", re.S)
    obj3 = re.compile(r"<a href=\"(?P<href>.*?)\">", re.S)
    code_name_list = [m.group("code") for m in obj2.finditer(ul)]
    href_list = [m.group("href") for m in obj3.finditer(ul)]
    resp.close()
    return code_name_list, href_list
def url_two(url2):
    """Fetch a fund's listing page and collect the pagination links.

    Parameters:
        url2: URL of the fund's first listing page.

    Returns:
        Sorted list of unique page hrefs found after the 首页 (first page)
        marker in the pagination block.
    """
    hrefs_2_list = []
    # NOTE(review): reconstructed regexes — the originals were truncated by
    # HTML stripping; group names ("ul", "href") taken from the usage below.
    obj_url2 = re.compile(r"&\">首页(?P<ul>.*?)</ul>", re.S)
    obj4 = re.compile(r"<a href=\"(?P<href>.*?)\"", re.S)
    resp2 = requests.get(url2)
    for nav_match in obj_url2.finditer(resp2.text):
        nav_block = nav_match.group("ul")
        for link_match in obj4.finditer(nav_block):
            # Restore HTML entities in the query string; the original
            # source had a no-op replace("&", "&") left over from entity
            # decoding during mangling.
            href_2_url = link_match.group("href").replace("&amp;", "&")
            hrefs_2_list.append(href_2_url)
    # Deduplicate and give the pages a stable order.
    hrefs_2_list = sorted(set(hrefs_2_list))
    resp2.close()
    return hrefs_2_list
def url_three(url3):
    """Fetch one listing page and extract the article links from its
    picnews_list section.

    Parameters:
        url3: URL of a listing page.

    Returns:
        List of article hrefs (with HTML entities decoded), in page order.
    """
    url = url3
    # NOTE(review): reconstructed regexes — originals truncated by HTML
    # stripping; group names ("href", "ul") taken from the usage below.
    obj2 = re.compile(r"<a href=\"(?P<href>.*?)\" target", re.S)
    obj1 = re.compile(r"picnews_list.*?(?P<ul>.*?)</ul>", re.S)
    resp = requests.get(url)
    result1 = obj1.search(resp.text)
    ul = result1.group("ul")
    href_list = []
    for m in obj2.finditer(ul):
        # Decode &amp; left in the query string (original had a no-op
        # replace("&", "&") from entity decoding during mangling).
        href_list.append(m.group("href").replace("&amp;", "&"))
    resp.close()
    return href_list
def url_four(url, df, code_name):
    """Fetch one article page, parse its donation table, and append each
    donation row to *df* via my_df().

    Parameters:
        url: article URL.
        df: accumulator DataFrame.
        code_name: fund identifier used as the row index name.

    Returns:
        The (possibly grown) DataFrame.
    """
    date = ""
    com_name = ""
    mony = ""
    resp = requests.get(url)
    html = etree.HTML(resp.text)
    # Page title; pages whose title contains 公示 (public notice) are skipped.
    fileName = html.xpath("/html/body/div[4]/div[2]/div[2]/b/text()")[0]
    pro_name = ""
    if "公示" in fileName:
        print(fileName, url)
    else:
        try:
            page = bs(resp.text, "html.parser")
            table = page.find("table").find_all("tr")
            for i, tr in enumerate(table, start=1):
                tds = tr.find_all("td")
                if i == 1:
                    # First row carries the project name.
                    pro_name = tds[0].text
                if i >= 4:
                    # Rows 4+ are donation records: date / donor / amount.
                    date = tds[0].text
                    com_name = tds[1].text
                    mony = tds[2].text
                    print(date, com_name, mony)
                    df = my_df(df, date, com_name, mony, pro_name,
                               fileName, url, code_name)
                else:
                    # NOTE(review): reconstructed from mangled source — this
                    # else pairs with `if i >= 4`, so header rows (i in 1..3)
                    # each append a placeholder "table is an image" row.
                    # That looks unintended; confirm against original intent.
                    pro_name = "表格为图片无法识别"
                    df = my_df(df, date, com_name, mony, pro_name,
                               fileName, url, code_name)
        except Exception:
            # Deliberate best-effort: pages without a parsable table (or
            # with too few cells) are skipped silently, as in the original.
            pass
    resp.close()
    return df
def my_df(df, date, com_name, mony, pro_name, fileName, url, code_name):
    """Classify the donor and append one donation record to *df*.

    Parameters:
        df: accumulator DataFrame.
        date: donation date text.
        com_name: donor name (anything after "(" is stripped).
        mony: donation amount text.
        pro_name: project name.
        fileName: page title.
        url: source page URL.
        code_name: fund identifier, used as the new row's index label.

    Returns:
        A new DataFrame with the record appended.
    """
    com_name = com_name.split("(")[0]
    # Crude keyword-based classification of the donor type.
    if "公司" in com_name:
        my_judge = "企业"
    elif "街道办" in com_name or "局" in com_name:
        my_judge = "政府"
    elif "村委会" in com_name:
        my_judge = "群众自治组织"
    elif "基金" in com_name:
        my_judge = "非盈利组织"
    else:
        my_judge = "个人"
    my_se = pd.Series({"日期": date,
                       "收支来源": com_name,
                       "收入金额": mony,
                       "收入类型": my_judge,
                       "专项基金名称": pro_name,
                       "专项基金名称G": fileName,
                       "网址": url}, name=code_name)
    # Bug fix: DataFrame.append was removed in pandas 2.0; pd.concat with
    # the Series transposed to a one-row frame is the exact equivalent.
    df = pd.concat([df, my_se.to_frame().T])
    # Throttle between appended rows (kept from the original).
    time.sleep(1)
    print(df)
    return df
if __name__ == "__main__":
    # Site roots are intentionally blank placeholders in the original;
    # fill them in before running.
    do_main_url = ""
    main_url = ""
    excel_path = r"C:\wq1.xlsx"
    # Level 1: fund names and their listing-page links from the main page.
    code_name_list, href_list_one = url_one(main_url)
    df = pd.DataFrame()
    for idx, first_href in enumerate(href_list_one):
        listing_url = main_url + first_href
        code_name = code_name_list[idx]
        # Level 2: pagination links for this fund.
        for page_href in url_two(listing_url):
            page_url = do_main_url + page_href
            # Level 3: article links on each page.
            for article_href in url_three(page_url):
                article_url = do_main_url + article_href
                # Level 4: parse each article's table into the frame.
                df = url_four(article_url, df, code_name)
    df.to_excel(excel_path)