# Web scraping basics (爬虫入门) — original post title, kept as a comment so the file parses

import urllib.request
import urllib.error  # bug fix: askURL catches urllib.error.URLError; import it explicitly
# POST request example (simulates a real logged-in user)
import urllib.parse
"""
data = bytes(urllib.parse.urlencode({"hello":"world"}),encoding="utf-8")#伪造身份码
response = urllib.request.urlopen("http://httpbin.org/post",data=data)#网站
print(response.read().decode("utf-8"))
"""
# GET request example
"""
response=urllib.request.urlopen("http://acm.zzuli.edu.cn/")#网站
print(response.read().decode("utf-8"))#获取网站源代码并用uft-8进行解码-
"""
# Timeout handling example
"""
try:
    response = urllib.request.urlopen("http://httpbin.org/post",timeout=100)#设置请求时间timeout=
    print(response.read().decode("utf-8"))
except urllib.error.URLError as e:#
    print("time out!")
"""
# Inspecting response headers (compare with the browser's F12 dev tools)
"""
response = urllib.request.urlopen("http://baidu.com")
# print(response.status)
print(response.getheaders("Server"))
"""
# Sending a browser User-Agent to zzulioj
"""
url="http://acm.zzuli.edu.cn/"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75"}
req=urllib.request.Request(url=url,headers=headers)#封装信息,如果是post则加入,method="POST"
response=urllib.request.urlopen(req)#发送网页请求
print(response.read().decode("utf-8"))
"""

def main():
    """Entry point: crawl the base page and (eventually) save the results."""
    baseurl = "http://acm.zzuli.edu.cn/"
    # Crawl the pages
    # bug fix: was `baseur = ...` then `getData(baseurl)` -> NameError
    datalist = getData(baseurl)
    savepath = "doubAmn"
    # Save the data (not yet implemented)
    # saveData(savepath)
    # bug fix: was `askURl(...)` -> NameError, the function is named askURL
    askURL("http://acm.zzuli.edu.cn/")

#爬取网页
def getData(baseurl):
    datalist=[]
    #逐一解析数据
    return datalist

# Get the content of the page at a given URL
def askURL(url):
    """GET *url* with a browser User-Agent and return the body decoded as UTF-8.

    Returns "" if the request fails; HTTP error codes / reasons are printed.
    """
    head = {
        # User agent: tells the server what kind of client we are
        # (i.e. what level of content we can accept)
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75"
    }
    # bug fix: was `headers=header` -> NameError, the dict is named `head`
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # bug fix: result was bound to `request` while the next line read an
        # undefined `response` -> NameError on every successful request
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html

# 你可能感兴趣的:(笔记) — scraper footer ("You may also be interested in: (notes)"), kept as a comment so the file parses