Python (personal use)
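
A scraper I keep around for personal use: it walks a foundation site's "发展动态" (news) list to find each special fund, follows the pagination and article links three levels deep, parses the donation table on every article page, and writes the result to an Excel file with pandas.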


from lxml import etree
import requests
import re
import time
import pandas as pd
from bs4 import BeautifulSoup as bs


def url_one(main_url):
    resp = requests.get(main_url)
    obj1 = re.compile(r"发展动态.*?
  • (?P
      .*?)
    ", re.S) result1 = obj1.finditer(resp.text) ##查找 #result1 = obj1.search(resp.text) ul = result1.group("ul") obj2 = re.compile(r"\">(?P.*?)基金", re.S) obj3 = re.compile(r".*?)\">", re.S) code_names_ = obj2.finditer(ul) hrefs = obj3.finditer(ul) code_name_list = [] href_list = [] for code_names in code_names_: code_name = code_names.group("code") code_name_list.append(code_name) for it_href in hrefs: href = it_href.group("href") href_list.append(href) resp.close() return code_name_list, href_list def url_two(url2): hrefs_2_list = [] obj_url2 = re.compile(r"&\">首页(?P
      .*?)
  • ", re.S) obj4 = re.compile(r".*?)\"", re.S) resp2 = requests.get(url2) hrefs_2_ = obj_url2.finditer(resp2.text) for hrefs_2 in hrefs_2_: href_2 = hrefs_2.group("ul") href_2_urls_ = obj4.finditer(href_2) for href_2_urls in href_2_urls_: href_2_url = href_2_urls.group("href") href_2_url = href_2_url.replace("&", "&") hrefs_2_list.append(href_2_url) hrefs_2_list = sorted(list(set(hrefs_2_list))) resp2.close() return hrefs_2_list def url_three(url3): url = url3 obj2 = re.compile(r".*?)\" target", re.S) obj1 = re.compile(r"picnews_list.*?
  • (?P
      .*?)
    ", re.S) resp = requests.get(url) result1 = obj1.search(resp.text) ul = result1.group("ul") hrefs_ = obj2.finditer(ul) href_list = [] for hrefs in hrefs_: href = hrefs.group("href") href = href.replace("&", "&") href_list.append(href) resp.close() return href_list def url_four(url, df, code_name): date = "" com_name = "" mony = "" # obj2 = re.compile(r"
    .*?)\" target", re.S) # obj1 = re.compile(r"picnews_list.*?
  • (?P
      .*?)
    ", re.S) resp = requests.get(url) html = etree.HTML(resp.text) fileName = html.xpath("/html/body/div[4]/div[2]/div[2]/b/text()")[0] pro_name = "" if "公示" in fileName: print(fileName, url) pass else: try: page = bs(resp.text, "html.parser") table = page.find("table").find_all("tr") i = 0 for tr in table: i += 1 tds = tr.find_all("td") if i == 1: pro_name = tds[0].text if i >= 4: date = tds[0].text com_name = tds[1].text mony = tds[2].text print(date, com_name, mony) df = my_df(df, date, com_name, mony, pro_name, fileName, url, code_name) else: pro_name = "表格为图片无法识别" df = my_df(df, date, com_name, mony, pro_name, fileName, url, code_name) pass except: pass resp.close() return df def my_df(df, date, com_name, mony, pro_name, fileName, url, code_name): my_judge = "" com_name = com_name.split("(")[0] if "公司" in com_name: my_judge = "企业" elif "街道办" in com_name or "局" in com_name: my_judge = "政府" elif "村委会" in com_name: my_judge = "群众自治组织" elif "基金" in com_name: my_judge = "非盈利组织" else: my_judge = "个人" my_se = pd.Series({"日期": date, "收支来源": com_name, "收入金额": mony, "收入类型": my_judge, "专项基金名称": pro_name , "专项基金名称G": fileName, "网址": url}, name=code_name) df = df.append(my_se) time.sleep(1) print(df) return df if __name__ == "__main__": do_main_url = "" main_url = "" excel_path = r"C:\wq1.xlsx" code_name_list, href_list_one = url_one(main_url) df = pd.DataFrame() for i, href in enumerate(href_list_one): url2 = main_url + href code_name = code_name_list[i] href_list_two = url_two(url2) for href_two in href_list_two: url3 = do_main_url + href_two href_three_list = url_three(url3) for href_three in href_three_list: url4 = do_main_url + href_three df = url_four(url4, df, code_name) df.to_excel(excel_path)