python爬虫爬取政府网站关键字

**功能介绍**
获取政府招标内容,若包含以下关键词,就提取该标书内容保存(本地文本):1、汽车采购 2、汽车租赁 3、公务车 4、公务车租赁 5、汽车协议供货 6、汽车 7、租赁
爬取网站
http://www.lxggzyjy.com/f/newtrade/annogoods/list?selectedProjectType=2
作者:
speed_zombie
版本信息:
python v3.7.4
运行:
python web_purchase.py
解析结果:
解析完成后会生成采购招标.txt文件。本脚本主要运用BeautifulSoup来解析网页的数据,BeautifulSoup是一款非常好用的解析插件。

废话不多上代码

# coding=UTF-8
import requests
from bs4 import BeautifulSoup

# Site root; relative project links from the listing are joined onto this.
host = "http://www.lxggzyjy.com"
# Government-procurement listing endpoint (POSTed with the payload below).
url = "http://www.lxggzyjy.com/f/newtrade/annogoods/getAnnoList"
# Keywords that flag a tender as vehicle-related.
keys = [
    "汽车采购",
    "汽车租赁",
    "公务车",
    "公务车租赁",
    "汽车协议供货",
    "汽车",
    "租赁",
]
# Root directory for saved files (NOTE(review): appears unused in this script).
dir_root = "ceshi"
# POST form payload: pageSize is set very large so one request
# (presumably) returns every announcement in a single page.
datas = {
    "pageNo": 0,
    "pageSize": 11120,
    "tradeStatus": 0,
    "prjpropertyid": "21,22,23,24",
    # Optional region filters ("tradeArea": 3025 … 3032) were left
    # disabled by the original author.
    "projectname": "",
    "tabType": "",
}
# Collect every project-detail link from the listing page.
def getWebUrl(_url, _data):
    """POST the search form and return the project-page links found.

    Parameters:
        _url:  listing endpoint returning an HTML page of announcements.
        _data: form payload (paging / filter fields).
    Returns:
        List of href strings whose path contains "/f/" (site-internal
        project pages). May contain duplicates; caller dedupes.
    """
    print("解析网址中...")
    # timeout so the script cannot hang forever on an unresponsive server
    res = requests.post(_url, _data, timeout=30)
    soup = BeautifulSoup(res.text, "lxml")
    news_url = []
    for a in soup.find_all('a'):
        # read href once; skip anchors without one or pointing elsewhere
        href = a.get('href')
        if href and "/f/" in href:
            news_url.append(href)
    return news_url

# De-duplication helper.
def noReaptArr(_score_arr):
    """Return a copy of *_score_arr* with duplicates removed,
    keeping only the first occurrence of each element, in order."""
    unique_items = []
    for item in _score_arr:
        if item in unique_items:
            continue  # already seen; skip it
        unique_items.append(item)
    return unique_items

# Scan each procurement page for the configured keywords.
def findKeyWord(_url_arr):
    """Fetch each project page and record which keyword (if any) it contains.

    Parameters:
        _url_arr: site-relative paths, joined onto the module-level ``host``.
    Returns:
        Newline-separated report lines ("项目名称:<<title>>----->出现了<key>"),
        or the string "暂无数据" when no page matched any keyword.
    """
    all_pro = ""
    for k in _url_arr:
        # timeout so one dead page cannot stall the whole crawl
        res = requests.get(host + k, timeout=30)
        soup = BeautifulSoup(res.text, "lxml")
        news_a = soup.find_all('h2')
        # Guard: a page may have no <h2>, or an <h2> whose .string is None
        # (mixed content) — either would crash the string concatenation below.
        str_pro = news_a[0].string if news_a else None
        if str_pro is None:
            str_pro = "未知项目"
        print("分析项目《"+str_pro+"》中")
        for key in keys:
            if res.text.find(key) >= 0:
                print("发现目标=============================="+key)
                all_pro += "项目名称:<<"+str_pro+">>----->出现了"+key+"\n"
                break  # first matching keyword is enough for this page
    if len(all_pro) > 0:
        return all_pro
    return "暂无数据"


# Save the report text to disk.
def mkfile(_filename, _code):
    """Write *_code* to *_filename* as UTF-8, replacing any existing file.

    Bug fix: the original called ``f.close`` without parentheses, so the
    file handle was never explicitly closed; ``with`` guarantees the
    flush/close even if the write raises.
    """
    with open(_filename, "w", encoding='utf-8') as f:
        f.write(_code)

# ---- script entry: crawl the listing, dedupe, scan pages, save report ----
all_url = getWebUrl(url, datas)
no_reapt_url_arr = noReaptArr(all_url)
dup_count = len(all_url) - len(no_reapt_url_arr)
print(f"\n共有{len(no_reapt_url_arr)}个项目需要分析\n其中有重复采购招标地址{dup_count}已经忽略\n")
projects = findKeyWord(no_reapt_url_arr)
mkfile("采购招标.txt", projects)
print("解析完成")

你可能感兴趣的:(python,爬虫)