给娃下资料

当妈的容易吗,为了批量从网页上下载 PDF,研究了一天 Python,终于找到个能用的。

#!/usr/bin/python

# -*- coding:utf-8 -*-

import urllib2

import re

import os

# open the url and read

def getHtml(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address passed straight to ``urllib2.urlopen``.

    Returns:
        Whatever ``read()`` on the opened response yields (the whole body).

    Raises:
        Any exception raised by ``urllib2.urlopen`` or by ``read()`` is
        propagated unchanged.
    """
    page = urllib2.urlopen(url)
    try:
        return page.read()
    finally:
        # Close the connection even when read() fails part-way through.
        page.close()

# compile the regular expressions and find

# all stuff we need

def getUrl(html):
    """Return the targets of all ``href``/``HREF`` attributes ending in ``.pdf``.

    Args:
        html: Page markup to scan.

    Returns:
        A list of the captured link targets, in document order. Targets are
        returned exactly as written in the markup (relative or absolute).
    """
    # The captured value may not contain quotes, angle brackets or line
    # breaks.  A bare ``.+?`` would happily run from an earlier non-PDF link
    # (e.g. href="index.htm") across tags up to the next ".pdf", yielding a
    # bogus URL.  Either quote style (or none) may open the attribute value.
    reg = r'(?:href|HREF)=["\']?((?:http://)?[^"\'<>\r\n]+?\.pdf)'
    url_re = re.compile(reg)
    return url_re.findall(html)

def getFile(url):
    """Download *url* into the current working directory.

    The local file is named after the text following the last ``/`` in
    *url* and is written in binary mode, 8192 bytes at a time.

    Args:
        url: Address passed straight to ``urllib2.urlopen``.

    Raises:
        Any exception raised while opening the URL, opening the output file,
        or copying data is propagated unchanged; both handles are closed
        first.
    """
    file_name = url.split('/')[-1]
    block_sz = 8192
    u = urllib2.urlopen(url)
    try:
        with open(file_name, 'wb') as f:
            while True:
                chunk = u.read(block_sz)
                if not chunk:
                    # An empty read marks the end of the response body.
                    break
                f.write(chunk)
    finally:
        # Release the connection whether or not the copy succeeded.
        u.close()
    print("Sucessful to download" + " " + file_name)

# Site root used to resolve relative links, and the index page to scan.
root_url = 'http://www.kizclub.com/'
raw_url = 'http://www.kizclub.com/body.htm'

html = getHtml(raw_url)
url_lst = getUrl(html)

# Reuse the download directory on re-runs; a bare os.mkdir would raise
# OSError the second time the script is started.
if not os.path.isdir('test2'):
    os.mkdir('test2')
os.chdir(os.path.join(os.getcwd(), 'test2'))

for url in url_lst:
    # Only relative links need the site root; prefixing an absolute link
    # would produce "http://www.kizclub.com/http://..." and fail to open.
    if not url.startswith(('http://', 'https://')):
        url = root_url + url
    print(url)
    getFile(url)

你可能感兴趣的:(给娃下资料)