利用urllib从ks5u下载2016高考真题

# coding: utf-8
import http.client
import http.cookiejar
import logging
import os
import urllib.parse
import urllib.request

from lxml import etree

# Show progress messages at INFO level and above.
logging.basicConfig(level=logging.INFO)

# Build an opener that keeps cookies in `cj` and install it as the default,
# so every later urllib.request.urlopen() call goes through it.
cj = http.cookiejar.CookieJar()
cookie_handler = urllib.request.HTTPCookieProcessor(cookiejar=cj)
opener = urllib.request.build_opener(cookie_handler)
urllib.request.install_opener(opener)

# Form fields sent with the login POST (the '******' values are presumably
# placeholders for a real account -- fill them in before running).
values = {'userid': '******', 'userpass': '******'}
# The POST body must be bytes, so the url-encoded form is ASCII-encoded.
data = urllib.parse.urlencode(values).encode('ascii')
ks5u_login_url = 'http://www.ks5u.com//User/Chk_UserLogin.asp'
req = urllib.request.Request(ks5u_login_url, data=data)

# Log in to the site.
# NOTE: the response body is read but never inspected, so the success
# message below is logged whenever the request itself does not raise.
logging.info('开始登陆网站!')
with urllib.request.urlopen(req) as response:
    html = response.read()
logging.info('登陆成功!')

def _file_id_from_href(href):
    """Return the numeric document id taken from *href*, or None.

    The id is the last path segment of the link with everything from its
    first dot onward removed. Links without an href, or whose segment is
    not made of digits only, yield None.
    """
    if not href:
        return None
    file_id = href.split('/')[-1].split('.')[0]
    return file_id if file_id.isdigit() else None


def _local_filename(header_name):
    """Turn the file name from the download response into a local file name.

    The header value is re-encoded from ISO-8859-1 and decoded as GBK to
    recover the Chinese file name. Any directory part is dropped so that a
    server-supplied name cannot write outside the current directory.
    Returns None when the response carries no usable file name.
    """
    if not header_name:
        return None
    try:
        decoded = header_name.encode('ISO-8859-1').decode('gbk')
    except UnicodeError:
        # Not a Latin-1/GBK round-trip candidate; use the name as delivered.
        decoded = header_name
    name = os.path.basename(decoded.replace('\\', '/'))
    return name or None


def _download_document(fileid, referer):
    """Download document *fileid* into the current directory.

    *referer* is sent as the Referer header: the site checks that the
    request appears to come from its own pages (anti-hotlinking). Header
    values must not contain Chinese characters, because they cannot be
    encoded as latin-1.

    Files that already exist are skipped. Network and file errors are
    logged and do not stop the run.
    """
    doc_url = 'http://www.ks5u.com/USER/INC/Downsch.asp?id=' + fileid
    req = urllib.request.Request(doc_url, None, {'Referer': referer})
    logging.info('开始下载编号为%s的文件', fileid)
    try:
        with urllib.request.urlopen(req) as r2:
            filename = _local_filename(r2.headers.get_filename())
            if filename is None:
                logging.warning('编号为%s的文件没有文件名,跳过', fileid)
                return
            # Skip files that were already downloaded.
            if os.path.exists(filename):
                logging.info('文件%s已经存在!', filename)
                return
            logging.info('获得文件名%s', filename)
            # Read the whole body before touching the disk, so a failed
            # transfer never leaves an empty file behind.
            content = r2.read()
    except (OSError, http.client.HTTPException):
        logging.exception('下载编号为%s的文件失败', fileid)
        return

    # Write to a temporary name and rename on success: the final name only
    # ever refers to a complete file, so an interrupted run is retried
    # instead of being skipped as "already exists".
    part_name = filename + '.part'
    try:
        with open(part_name, 'wb') as outfile:
            outfile.write(content)
        os.replace(part_name, filename)
    except OSError:
        logging.exception('保存文件%s失败', filename)
        return
    logging.info('保存文件%s成功!', filename)


logging.info('访问2016高考真题下载页面!')
all_url = 'http://old.ks5u.com/shiti/gaokao/2016/'
with urllib.request.urlopen(all_url) as response:
    html = response.read().decode('utf-8')

# Parse the listing page with lxml once the response has been closed.
root = etree.HTML(html)

# XPath '//div[@class="sub_jiexi"]':
#   //        any node in the document
#   div       element name
#   [@...]    condition the attributes must satisfy
for div in root.xpath('//div[@class="sub_jiexi"]'):
    for a in div:
        if a.text != '解析':
            continue
        href = a.get('href')
        fileid = _file_id_from_href(href)
        if fileid is None:
            logging.warning('跳过无法识别文件编号的链接: %s', href)
            continue
        _download_document(fileid, href)

你可能感兴趣的:(利用urllib从ks5u下载2016高考真题)