python爬虫

sed -i -e 's/\r$//'  install_python.sh  windows编码转linux

sudo !! 获取root权限

阿里云的rpm
http://mirrors.aliyun.com/centos/7/os/x86_64/Packages/
缓存rpm文件解决依赖  下载rpm文件离线安装
https://www.cnblogs.com/nmap/p/9511848.html

sudo rpm -Uvh ./gcc/*.rpm --nodeps --force   强制安装

sudo !! root权限
./configure --prefix=/usr 指定路径
make Makefile编译 gcc
make install 安装  make uninstall 是卸载
CC是gcc的连接,gcc是编译器 gcc包含很多编译器(C, C++, Objective-C, Ada, Fortran,and   Java)

第一版本

#!/bin/bash
# Offline installer (first version), tested with Python 3.8.2.
# Unpacks the bundled source, builds it into /opt/python3, repoints the
# python3/pip3 symlinks, installs wheels from a local cache and starts CNVD.py.
# NOTE: Windows and Linux scripts use different line endings; strip CR
# characters with `sed -i -e 's/\r$//'` if this file was edited on Windows.
echo "定义参数"
python_path="/opt/python/"
sh_path="/opt/nssa/"

# Install the bundled build-dependency RPMs (forced, dependencies ignored —
# this is an offline box with no repository access).
sudo rpm -Uvh ./python/rpm/*.rpm --nodeps --force

# Start from a clean source directory.
if [ -d "$python_path" ]; then
    sudo rm -rf "$python_path"
fi
sudo tar -xvf ./python/Python-v3.8.2.tgz
sudo mv Python-3.8.2 /opt
sudo mv /opt/Python-3.8.2 /opt/python

# Ship the offline wheel cache and the requirements file with the source tree.
sudo cp -r ./python/packages "${python_path}"
sudo cp ./python/requirements.txt "${python_path}"

# Recreate the install prefix from scratch.
if [ -d /opt/python3 ]; then
    sudo rm -rf /opt/python3
fi

# Open permissions on intermediate build files; repeated after each build
# step because configure/make keep creating new files.
sudo mkdir /opt/python3
sudo chmod 777 -R /opt/python3/*
sudo chmod 777 -R /opt/python/*

# Abort if we cannot enter the source directory — otherwise configure/make
# would run against whatever directory we happen to be in.
cd "$python_path" || exit 1
sudo ./configure --prefix=/opt/python3
sudo chmod 777 -R /opt/python3/*
sudo chmod 777 -R /opt/python/*
sudo make
sudo chmod 777 -R /opt/python3/*
sudo chmod 777 -R /opt/python/*
sudo make install

# Replace any pre-existing symlinks so python3/pip3 point at the new build.
sudo rm -rf /usr/bin/python3
sudo rm -rf /usr/bin/pip3
sudo ln -s /opt/python3/bin/python3.8 /usr/bin/python3
sudo ln -s /opt/python3/bin/pip3.8 /usr/bin/pip3
# Install third-party packages from the local wheel cache only (no network).
sudo pip3 install --no-index --find-links="$python_path"/packages -r "$python_path"/requirements.txt
cd "$sh_path" || exit 1
# Run the crawler detached, discarding all of its output.
sudo python3 CNVD.py >/dev/null 2>&1 &

第二版本

#!/bin/bash
# Offline installer (second version), tested with Python 3.8.2.
# Builds Python under /usr/local/python3 and installs packages from a local
# wheel cache, then runs CNVD.py in the foreground.
# NOTE: Windows and Linux scripts use different line endings; strip CR
# characters with `sed -i -e 's/\r$//'` if this file was edited on Windows.
echo "定义参数"
python_path="/opt/python/"
sh_path="/opt/nssa/"

# Start from a clean source directory.
if [ -d "$python_path" ]; then
    sudo rm -rf "$python_path"
fi

sudo tar -xvf ./python/Python-v3.8.2.tgz
sudo mv Python-3.8.2 /opt
sudo mv /opt/Python-3.8.2 /opt/python

# Ship the offline wheel cache and the requirements file with the source tree.
sudo cp -r ./python/packages "${python_path}"
sudo cp ./python/requirements.txt "${python_path}"

# Abort if we cannot enter the source directory — otherwise configure/make
# would run against whatever directory we happen to be in.
cd "$python_path" || exit 1
# The tree was extracted via sudo (root-owned), so configure must run as
# root too or it fails writing its output files.
sudo ./configure --prefix=/usr/local/python3
# BUGFIX: `make install` must also run under sudo — it writes to
# /usr/local, which an unprivileged user cannot modify.
sudo make && sudo make install
# Replace any pre-existing symlinks so python3/pip3 point at the new build.
sudo rm -rf /usr/bin/python3
sudo rm -rf /usr/bin/pip3
sudo ln -s /usr/local/python3/bin/python3.8 /usr/bin/python3
sudo ln -s /usr/local/python3/bin/pip3.8 /usr/bin/pip3
# Useful pip commands for maintaining the offline cache:
#   pip3 list                        list installed packages
#   pip3 show requests               show a package's version and location
#   pip3 freeze > requirements.txt   capture versions for offline reinstall
#   pip3 install --no-index --find-links=DIR -r requirements.txt
#   pip3 download -i http://mirrors.aliyun.com/pypi/simple/ \
#        --trusted-host mirrors.aliyun.com -d DIR -r requirements.txt
# Install third-party packages from the local wheel cache only (no network).
sudo pip3 install --no-index --find-links="${python_path}"packages -r "${python_path}"requirements.txt
cd "$sh_path" || exit 1
sudo python3 CNVD.py


import requests
from bs4 import BeautifulSoup
import time
import pymysql
# --- MySQL connection settings ---
# 'XXXX' are placeholders redacted for publication; fill in real credentials.
hosts='XXXX'
ports=XXXX
users='XXXX'
passwds='XXXX'
dbs='XXXX'

# Desktop-Chrome User-Agent so the site serves the normal HTML pages.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'}
# typeId values of the vulnerability categories to crawl.
ur = ['29','32','28','27','30','31']
# CNVD — China National Vulnerability Database (国家信息安全漏洞共享平台).
# urli lists flaws of one category; urlii is the site root used to build
# absolute links from relative hrefs.
urli = "https://www.cnvd.org.cn/flaw/typeResult?typeId=";
urlii = "https://www.cnvd.org.cn"

def main():
    """Crawl every configured CNVD vulnerability category."""
    for type_id in ur:
        getDataByurl(urli + type_id)
        
def getDataByurl(url):
    """Fetch one CNVD flaw-list page, follow every link on it, parse each
    detail page and insert one row per vulnerability into MySQL.

    url: a category listing URL, e.g.
         https://www.cnvd.org.cn/flaw/typeResult?typeId=29

    A failure on one link is logged and the loop moves on to the next link.
    """
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')
    for link in soup.find_all('a'):
        try:
            time.sleep(10)  # throttle requests to dodge anti-crawler checks
            detail_url = urlii + link.get('href')
            detail_resp = requests.get(url=detail_url, headers=headers)
            detail_resp.encoding = 'utf-8'
            detail = BeautifulSoup(detail_resp.text, 'lxml')
            title = detail.find('h1').text
            # The detail table keeps each field at a fixed <td> position.
            # (renamed from `all`, which shadowed the builtin)
            cells = detail.find('table', class_="gg_detail").select('td')
            date = cells[3].get_text().strip()
            level = cells[5].get_text().strip()[0]
            product = cells[7].get_text().strip()
            hole = cells[9].get_text().strip()
            func = cells[15].get_text().strip()
            patch = cells[17].get_text().strip()
            # WARNING(review): string-built SQL is vulnerable to injection if
            # any scraped field contains a quote — prefer a parameterized
            # query (cursor.execute(sql, params)).
            sql = "INSERT into reptile (url,title, date,`level`,product,hole,func,patch) values ('"+detail_url+"','"+title+"','"+date+"','"+level+"','"+product+"','"+hole+"','"+func+"','"+patch+"');"
            query(sql)
        except Exception as e:
            # BUGFIX: the original had an empty `finally:` block, which is a
            # SyntaxError, and printed a bare 'error' that hid the cause.
            print('error', e)
        
def query(sql):
    """Execute *sql* against the configured MySQL database and commit.

    Opens a fresh connection per call; the cursor and connection are always
    closed, even if execution raises (the original leaked both on error).
    """
    # Consider charset='utf8mb4' so 4-byte UTF-8 text is stored intact.
    conn = pymysql.connect(host=hosts, port=ports, user=users, passwd=passwds, db=dbs)
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql)
            conn.commit()
        finally:
            cur.close()
    finally:
        conn.close()

# Script entry point: crawl all configured categories when run directly.
if __name__ == '__main__':
    main()
    
    
#href = soup.find_all('a',class_="current")
# li = soup.find('li')
# print('find_li:',li)
# print('li.text(返回标签的内容):',li.text)
# print('li.attrs(返回标签的属性):',li.attrs)
# print('li.string(返回标签内容为字符串):',li.string)
# 常用通过find_all()方法来查找标签元素:<>.find_all(name, attrs, recursive, string, **kwargs) ,返回一个列表类型,存储查找的结果 

# • name:对标签名称的检索字符串
# • attrs:对标签属性值的检索字符串,可标注属性检索
# • recursive:是否对子孙全部检索,默认True
# • string:<>…中字符串区域的检索字符串

# soup = BeautifulSoup(html, 'lxml') 
# print(type(soup.select('title'))) 
# print(soup.select('title')[0].get_text()) 
# for title in soup.select('title'):     
# print(title.get_text()) 
 

你可能感兴趣的:(linux)