Python 获取 CSS、图片的思路

import urllib2
import os
from bs4 import BeautifulSoup 
import re
# Not referenced by any code visible in this file.
siteUrls = " "

# Address of the page to fetch; passed to getContent() at the bottom of the file.
url = "http://www.sina.com.cn"
def getContent(url):
    """Download the page at `url` and save it locally as "<name>.html".

    The page is fetched with urllib2, passed through writeCss() and
    writefileName() (which return the HTML with asset links rewritten), and
    the result is written to the current working directory.

    <name> is the query string of `url` (the text after the final "?").
    When the URL has no usable query string, a name is derived from the URL
    itself (scheme removed, unsafe characters replaced by "_") instead of
    failing with IndexError.

    Args:
        url: Absolute URL of the page to save.

    Raises:
        urllib2.URLError: If the page cannot be fetched.
        IOError: If the output file cannot be written.
    """
    response = urllib2.urlopen(url)
    try:
        content = response.read()
    finally:
        response.close()

    content = writeCss(url, content)
    content = writefileName(url, content)

    fileNames = re.findall(r'/[^\?]*\?([^/|^\?]*)$', url)
    if fileNames and fileNames[0]:
        fileName = fileNames[0]
    else:
        # No query string to name the output after: fall back to a
        # filesystem-safe form of the URL, e.g. "www.sina.com.cn".
        stripped = re.sub(r'^\w+://', '', url)
        fileName = re.sub(r'[^A-Za-z0-9._-]+', '_', stripped).strip('._') or 'index'
    print(fileName)

    # urlopen().read() returns raw bytes; write them back untouched so the
    # page's original encoding and line endings are preserved.
    with open(fileName + ".html", 'wb') as f:
        f.write(content)

def writeCss(url,content):
    """Download the stylesheets linked from a page and point the page at the copies.

    Every <link type="text/css"> tag in `content` that has an href is fetched
    (the href is resolved against `url`), saved as "<dir>/<basename>.css" and
    each occurrence of the original href in `content` is replaced by that
    local path.

    <dir> is the query string of `url`; when the URL has no usable query
    string a filesystem-safe form of the URL is used instead. The directory
    is created on demand, only if at least one stylesheet is saved.

    Args:
        url: Absolute URL the page was fetched from.
        content: Raw HTML of the page.

    Returns:
        `content` with the stylesheet hrefs rewritten to the local copies.

    Raises:
        urllib2.URLError: If a stylesheet cannot be fetched.
        IOError / OSError: If the directory or a file cannot be written.
    """
    # Python 2 stdlib; imported here so the file's import block stays untouched.
    from urlparse import urljoin

    fileNames = re.findall(r'/[^\?]*\?([^/|^\?]*)$',url)
    if fileNames and fileNames[0]:
        fileName = fileNames[0]
    else:
        # No query string to name the directory after: fall back to a
        # filesystem-safe form of the URL, e.g. "www.sina.com.cn".
        stripped = re.sub(r'^\w+://', '', url)
        fileName = re.sub(r'[^A-Za-z0-9._-]+', '_', stripped).strip('._') or 'index'
    print(fileName)

    soup = BeautifulSoup(content)
    handled = set()
    for css in soup.findAll('link',attrs={'type':'text/css'}):
        cssurls = re.findall(r'.*href=\"([^\"]*)\"',str(css))
        if not cssurls or not cssurls[0]:
            continue  # <link> without a usable href: nothing to download
        href = cssurls[0]
        if href in handled:
            continue  # replace() below already rewrote every occurrence
        handled.add(href)

        # Name the local copy after the last path segment of the href itself
        # (query string and fragment dropped) rather than pattern-matching the
        # whole tag, which could capture unrelated attributes.
        baseName = href.split('?')[0].split('#')[0].rstrip('/').rsplit('/', 1)[-1]
        if not baseName:
            continue
        if not baseName.lower().endswith('.css'):
            baseName += '.css'
        cssNewName = fileName + "/" + baseName

        # Resolve relative, root-relative and absolute hrefs against the page
        # URL. The tag was re-serialised by BeautifulSoup, so "&" arrives as
        # "&amp;" and has to be undone before requesting.
        cssurl = urljoin(url, href.replace('&amp;', '&'))
        response = urllib2.urlopen(cssurl)
        try:
            csscontent = response.read()
        finally:
            response.close()

        if not os.path.isdir(fileName):
            os.mkdir(fileName)
        with open(cssNewName, 'wb') as cssfile:
            cssfile.write(csscontent)

        content = content.replace(href, cssNewName)
    return content

def writefileName(url,content):
    """Download the images used by a page and point the page at the copies.

    Every <img> tag in `content` with a src is fetched (the src is resolved
    against `url`), saved as "<dir>/<basename>" and each occurrence of the
    original src in `content` is replaced by that local path. Inline
    "data:" images and tags without a src are left untouched.

    <dir> is the query string of `url`; when the URL has no usable query
    string a filesystem-safe form of the URL is used instead. The directory
    is created on demand, only if at least one image is saved.

    Args:
        url: Absolute URL the page was fetched from.
        content: Raw HTML of the page.

    Returns:
        `content` with the image srcs rewritten to the local copies.

    Raises:
        urllib2.URLError: If an image cannot be fetched.
        IOError / OSError: If the directory or a file cannot be written.
    """
    # Python 2 stdlib; imported here so the file's import block stays untouched.
    from urlparse import urljoin

    fileNames = re.findall(r'/[^\?]*\?([^/|^\?]*)$',url)
    if fileNames and fileNames[0]:
        fileName = fileNames[0]
    else:
        # No query string to name the directory after: fall back to a
        # filesystem-safe form of the URL, e.g. "www.sina.com.cn".
        stripped = re.sub(r'^\w+://', '', url)
        fileName = re.sub(r'[^A-Za-z0-9._-]+', '_', stripped).strip('._') or 'index'

    soup = BeautifulSoup(content)
    handled = set()
    for img in soup.findAll('img'):
        imgUrls = re.findall(r'.*src=\"([^\"]*)\"',str(img))
        if not imgUrls or not imgUrls[0]:
            continue  # <img> without a usable src: nothing to download
        src = imgUrls[0]
        if src in handled or src.startswith('data:'):
            continue  # already rewritten, or an inline image with no URL
        handled.add(src)

        # Take the file name (with whatever extension it has) from the last
        # path segment of the src, dropping any query string or fragment, so
        # every image type is handled and no "?" ends up in a file name.
        baseName = src.split('?')[0].split('#')[0].rstrip('/').rsplit('/', 1)[-1]
        if not baseName:
            continue
        imgNewName = fileName + "/" + baseName

        # Resolve relative, root-relative and absolute srcs against the page
        # URL. The tag was re-serialised by BeautifulSoup, so "&" arrives as
        # "&amp;" and has to be undone before requesting.
        imgUrl = urljoin(url, src.replace('&amp;', '&'))
        response = urllib2.urlopen(imgUrl)
        try:
            imgContent = response.read()
        finally:
            response.close()

        if not os.path.isdir(fileName):
            os.mkdir(fileName)
        # Binary mode: text mode would translate newline bytes on Windows and
        # corrupt the image.
        with open(imgNewName, 'wb') as imgfile:
            imgfile.write(imgContent)

        content = content.replace(src, imgNewName)
    return content

# Script entry point: runs whenever this file is executed or imported and
# fetches the page named by the module-level `url`.
getContent(url)


你可能感兴趣的:(python 获取css,图片 思想)