2.python爬虫基础——Urllib库

#python中Urllib库实战
#系统学习urllib模块，从urllib基础开始。学习urlretrieve()，urlcleanup(),info(),getcode(),geturl()
import urllib.request
# urlretrieve(): download a web page straight to a local file.
urllib.request.urlretrieve("http://www.hellobi.com",filename="/Users/xubin/myapp/pythonfile/urlretrieve.html")

# urlcleanup(): clear the cache / temporary files left behind by urlretrieve().
urllib.request.urlcleanup()

# Open the page once and inspect the response object. The `with` block makes
# sure the underlying connection is closed once the three values are printed
# (the response was previously left open for the rest of the script).
with urllib.request.urlopen("http://www.hellobi.com") as response:
    # info(): show basic information about the response (its headers).
    print(response.info())

    # getcode(): the HTTP status code of the request; 200 means success.
    print(response.getcode())

    # geturl(): the URL that was actually fetched.
    print(response.geturl())

# Timeout settings
# Fetching a page takes time. If the server does not respond for too long, the
# request is considered timed out and the page cannot be opened.
# Pick the timeout (in seconds) to suit the server: a short value such as 2 for
# a fast server, a long one such as 100 for a slow server.
# The response is not used here, so close it right away instead of leaking it.
with urllib.request.urlopen("http://www.hellobi.com", timeout=1):
    pass

# Try the same URL ten times with a deliberately tiny timeout to show how
# timeouts surface as exceptions.
for _ in range(10):
    try:
        # `with` closes the connection even when read() fails part-way.
        with urllib.request.urlopen("http://yum.iqianyue.com", timeout=0.1) as response:
            data = response.read()
        print(len(data))
    except Exception as e:
        # Deliberately broad: this demo only reports the failure and moves on
        # to the next attempt.
        print("出现异常："+str(e))

# Simulating HTTP requests automatically
# A client talks to a server through HTTP requests, of which there are several
# kinds. The two that matter here are POST and GET:
#   - logging in to a site usually needs a POST request;
#   - searching for something usually needs a GET request.

# Search Baidu for a keyword with a GET request. GET parameters follow the "?"
# in the URL, e.g. https://www.baidu.com/s?wd=python
import urllib.request
import urllib.parse
import re
keywd = "徐彬"
# Percent-encode the non-ASCII keyword so it is valid inside a URL. quote() is
# documented in urllib.parse; urllib.request only re-exports it incidentally.
keywd = urllib.parse.quote(keywd)
url = "http://www.baidu.com/s?wd=" + keywd    # author's note: https must not be used here
req = urllib.request.Request(url)
# Close the HTTP response as soon as the body has been read.
with urllib.request.urlopen(req) as response:
    data = response.read()
# Write the raw bytes; `with` closes the file even if the write fails.
with open("/Users/xubin/myapp/pythonfile/百度python.html", "wb") as fh:
    fh.write(data)

# POST request: e.g. logging a user in requires submitting a POST request.
# Login page: passport.csdn.net/account/login
# Form fields: "username" for the user name, "password" for the password.
import urllib.request
import urllib.parse
url = "https://passport.csdn.net/account/login"
# NOTE(review): real-looking credentials are hard-coded below. They are left
# unchanged so the request stays identical, but they should be moved out of
# the source (environment variable / prompt) and rotated.
mydata = urllib.parse.urlencode({
      "username":"bingoxubin","password":"19900127LLBingo"}).encode("utf-8")
# Passing a data payload to Request makes urllib send a POST instead of a GET.
req = urllib.request.Request(url, mydata)
# Close the HTTP response as soon as the body has been read.
with urllib.request.urlopen(req) as response:
    data = response.read()
# Write the raw bytes; `with` closes the file even if the write fails.
with open("/Users/xubin/myapp/pythonfile/csdn登录界面.html", "wb") as fh:
    fh.write(data)


# Disabled draft: the code below sits inside a bare string literal, so it is
# never executed. Per the author's note inside it, the goal was to scrape all
# photos from an OA site into a .docx file, but with what had been learned so
# far only a single page could be scraped.
# NOTE(review): as written, the URL passed to urlopen() has no scheme and the
# regex pattern `pat` is empty; both would need fixing before re-enabling.
'''
#爬取oa上的所有照片，存到OA照片.docx中  #遇到问题，目前所学，只能爬取单页的内容
import re
import urllib.request

data=urllib.request.urlopen("oa.epoint.com.cn").read()
data=data.decode("utf-8")
pat=""
mydata=re.compile(pat).findall(data)
fh=open("/Users/xubin/myapp/pythonfile/OA照片.docx","w")
for i in range(0,len(mydata)):
    fh.write(mydata[i]+"\n")
fh.close()
'''
2.python爬虫基础——Urllib库

你可能感兴趣的:(python,爬虫,运维)