A BeautifulSoup script for crawling web-page links on an intranet

#!/usr/bin/env python3

# -*- coding:utf-8 -*-

"""

author: xiaohong.d

date: 2020-04-30

description: Python 3 helper invoked from a Groovy script; uses the given
parameters to filter one module engine's tag list.
Requires the requests and BeautifulSoup packages and is intended to run only on a slave node.

"""

import getopt
import json
import sys

import requests
from bs4 import BeautifulSoup

 

class RequestsParam:
    """Bundle the pieces of a request URL together with its timeout.

    The final URL is the plain concatenation url + prefix + module + suffix;
    no separator handling is done, so callers must supply trailing slashes
    themselves.
    """

    def __init__(self, url, prefix, module, suffix, timeout):
        # Full request path, e.g. 'http://host:9999/' + 'box/' + 'mod/' + 'ine/'.
        self.path = url + prefix + module + suffix
        # Timeout in seconds, passed straight through to requests.get().
        self.timeout = timeout

 

class Retrieve:
    """Fetch the page described by a RequestsParam."""

    def __init__(self, requestsparam):
        # RequestsParam instance supplying .path and .timeout.
        self.req = requestsparam

    def getPage(self):
        """GET self.req.path and return the response body as text.

        Raises requests.exceptions.Timeout / ConnectionError on network
        failure; HTTP error statuses are NOT raised (no raise_for_status),
        the body is returned as-is.
        """
        r = requests.get(self.req.path, timeout=self.req.timeout)
        return r.text

 

class ParsingHTML:
    """Parse an HTML page and extract the text of its <a> links."""

    def __init__(self, content):
        # Raw HTML text to parse.
        self.content = content

    def parseVersionLi(self):
        """Return the text of every <a> tag with '/' characters stripped,
        skipping the first entry (presumably the '../' parent-directory link
        of an autoindex listing — confirm against the server output).

        Bug fix: anchors whose .string is None (an <a> wrapping other tags,
        or with mixed content) previously raised AttributeError; such
        anchors are now skipped.
        """
        soup = BeautifulSoup(self.content, 'lxml')
        names = [a.string.replace("/", "")
                 for a in soup.find_all("a")
                 if a.string is not None]
        return names[1:]

if __name__ == '__main__':
    # Parse command line: -a <module> selects the module, -h prints help.
    try:
        opts, args = getopt.getopt(sys.argv[1:], "ha:")
    except Exception as ex:
        print(ex)
        sys.exit()

    # Default module name when -a is not supplied.
    args = "shield"
    for opt, val in opts:
        if opt == "-a":
            args = val
        if opt == "-h":
            print("this is a help message")

    # Build the listing URL, fetch the page, and print one entry per line.
    rp = RequestsParam('http://*****:9999/', 'box/', args + '/', 'ine/', 0.300)
    retrieve = Retrieve(rp)
    res = retrieve.getPage()
    ph = ParsingHTML(res)
    li = ph.parseVersionLi()
    for i in li:
        print(i)

You may also be interested in: (Python)