利用 BeautifulSoup 下载网页 HTML 代码中的 CSS、JS、图片文件并保存

# -*- coding:utf-8 -*-
import os
import urllib.request as rqst
from urllib.parse import urljoin, urlsplit

from bs4 import BeautifulSoup as BS

# Page whose assets are mirrored, and the headers sent with every request.
url = 'http://xxxxxxx'
# The User-Agent value is a placeholder; substitute a real browser string.
# NOTE(review): 'utf-8' is a charset, not an HTTP content-coding -- confirm
# whether 'identity' was intended for Accept-Encoding.
headers = {'User-Agent': 'xxxxxx(这个网上随便找一个都可以)','Accept-Encoding':'utf-8'}


def _save_resource(link):
    """Download one asset referenced by the page into the current directory.

    ``link`` is the raw ``href``/``src`` attribute value (or ``None`` when
    the tag has no such attribute). It is resolved against the page URL so
    relative and root-relative paths work. A file that already exists is
    left untouched. Any failure is reported on stdout and the script moves
    on to the next asset.
    """
    if not link:
        return
    resource_url = urljoin(url, link)
    parts = urlsplit(resource_url)
    # Skip inline ``data:`` URIs, ``javascript:`` links and the like.
    if parts.scheme not in ('http', 'https'):
        return
    # Name the file after the last path segment, ignoring any query string.
    file = parts.path.split('/')[-1]
    if not file or os.path.exists(file):
        return
    try:
        with rqst.urlopen(rqst.Request(resource_url, headers=headers)) as res:
            txt = res.read()
        # read() returns bytes, so the file must be opened in binary mode
        # for CSS and JS as well as for images.
        with open(file, 'wb') as f:
            f.write(txt)
    except Exception as err:
        # Best-effort mirror: one broken asset must not abort the rest,
        # but the failure is reported instead of being silently dropped.
        print('skipped %s: %s' % (resource_url, err))


# Fetch the page (sending the custom headers) and parse it with BeautifulSoup.
r = rqst.Request(url, headers=headers)
with rqst.urlopen(r) as html:
    bs = BS(html, 'lxml')

# Collect the tags that reference css, js and img files.
elc = bs.find_all('link', type='text/css')
elj = bs.find_all('script')
eli = bs.find_all('img')

# Save every referenced css, js and img file.
for c in elc:
    _save_resource(c.get('href'))

for j in elj:
    # Inline <script> tags have no src; .get() returns None and they are skipped.
    _save_resource(j.get('src'))

for i in eli:
    _save_resource(i.get('src'))

 

你可能感兴趣的:(利用beautifulsoup下载网页html代码中的css, js, img文件并保存)