day01-urllib

The scraping workflow

  • Request the site and fetch the page source: urllib, requests, selenium, pyquery
  • Parse the source: regular expressions, lxml.etree, beautifulsoup4, selenium
  • Store the data: pymysql, pymongo
    (selenium covers the requesting step through browser automation; a minimal sketch of all three stages follows this list)
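A minimal end-to-end sketch of the three stages. httpbin.org stands in for a real target, and the regex and plain list stand in for real parsing and storage:

import re
import urllib.request

# Stage 1: request the site and fetch the source
html = urllib.request.urlopen('http://httpbin.org/get').read().decode('utf-8')

# Stage 2: parse the source (here: pull the client IP out of the JSON body)
origin = re.search(r'"origin":\s*"(.*?)"', html).group(1)

# Stage 3: store the data (a list stands in for pymysql/pymongo)
records = [{'origin': origin}]
print(records)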

Project setup

  • python -m venv 1902spiderenv (the environment name)
  • Create the project folder on the E: drive, open it in PyCharm, and point the project interpreter at the new environment (a quick sanity check follows this list)
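To confirm PyCharm really picked up the new environment, a quick check in its Python console:

import sys
print(sys.prefix)      # should point into the 1902spiderenv directory
print(sys.executable)  # the interpreter inside the venv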

1. urllib syntax

1.1 Syntax

  • urllib.request.urlopen(url, data, timeout)
  • Passing data makes the request a POST; leaving it out makes it a GET (a sketch of the returned response object follows this list)
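urlopen() returns an http.client.HTTPResponse; besides read(), a few fields are worth knowing (a sketch against httpbin.org):

import urllib.request

response = urllib.request.urlopen('http://httpbin.org/get')
print(response.status)                     # HTTP status code, e.g. 200
print(response.getheader('Content-Type'))  # one response header
print(response.getheaders())               # all headers as (name, value) pairs
print(response.geturl())                   # final URL after any redirects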

import urllib.error
import urllib.parse
import urllib.request

# 1.1 Passing only the url
url = 'http://www.baidu.com'
response = urllib.request.urlopen(url)
# read() returns the response body, which is bytes by default
text = response.read().decode('utf-8')

# 1.2 Passing the url with the parameters in the query string
# Parameters carried in the url itself make this a GET request
url = 'http://www.baidu.com/s?wd=python'
response = urllib.request.urlopen(url)
text = response.read().decode('utf-8')

# 1.3 Parameters passed via urlopen()'s data argument make this a POST request
url = 'http://www.baidu.com/s'
data = {
    'wd': 'python'
}
# urlencode() turns the dict into key=value&key=value form
data = urllib.parse.urlencode(data)
# bytes() converts the string to the bytes type that urlopen() needs
data = bytes(data, encoding='utf-8')
response = urllib.request.urlopen(url, data)
text = response.read().decode('utf-8')
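urlencode() also handles several parameters and non-ASCII values at once; for a bare string segment, quote() does the same percent-encoding (a quick sketch):

import urllib.parse

params = {'wd': '爬虫', 'pn': 10}
print(urllib.parse.urlencode(params))  # wd=%E7%88%AC%E8%99%AB&pn=10
print(urllib.parse.quote('爬虫'))       # %E7%88%AC%E8%99%AB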


# 1.4 Passing timeout - the maximum time to wait for a response
url = 'http://www.baidu.com'
try:
    response = urllib.request.urlopen(url, timeout=0.01)
    text = response.read().decode('utf-8')
    print(text)
except urllib.error.URLError as e:
    print('timed out')
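URLError covers more than timeouts - DNS failures and refused connections land here too, and HTTPError (a subclass, so it must be caught first) fires when the server answers with an error status. A sketch that tells the cases apart:

import socket
import urllib.error
import urllib.request

try:
    response = urllib.request.urlopen('http://httpbin.org/status/404', timeout=5)
except urllib.error.HTTPError as e:
    # The server responded, but with an error status
    print('HTTP error:', e.code)
except urllib.error.URLError as e:
    # No usable response at all; e.reason says why
    if isinstance(e.reason, socket.timeout):
        print('timed out')
    else:
        print('failed:', e.reason)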


2. Syntax 2 - urllib.request.Request

  • request = urllib.request.Request(url, data, headers, method)
  • urllib.request.urlopen(request)  # urlopen() also accepts a Request object (an explicit-method sketch follows this list)
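The method argument lets a Request override the default verb (GET without data, POST with it) - for example an explicit PUT, sketched against httpbin.org, which echoes the request back:

import urllib.parse
import urllib.request

data = bytes(urllib.parse.urlencode({'wd': 'python'}), encoding='utf-8')
request = urllib.request.Request('http://httpbin.org/put', data=data, method='PUT')
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))  # httpbin echoes the form data and method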

# Request(url): passing only the url
url = 'https://movie.douban.com/top250'
request = urllib.request.Request(url)
response = urllib.request.urlopen(request)
text = response.read().decode('utf-8')
# print(text)

# Request(url, data): passing the url plus data
# data must be bytes; the request goes out as a POST
url = 'http://www.baidu.com/s'
data = {
    'wd': 'python'
}
data = urllib.parse.urlencode(data)
# Convert the str data to bytes
data = bytes(data, encoding='utf-8')
request = urllib.request.Request(url, data)
response = urllib.request.urlopen(request)
text = response.read().decode('utf-8')


# Request(url, headers=...): headers is the request-header dict, e.g. User-Agent
# The server can inspect User-Agent to judge whether a request comes from a crawler or a real browser

url = 'http://httpbin.org/get'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
request = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(request)
text = response.read().decode('utf-8')
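Headers can also be attached after the Request is built, via add_header() - equivalent to passing headers= up front:

import urllib.request

request = urllib.request.Request('http://httpbin.org/get')
request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))  # httpbin echoes the headers it received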


# Routing requests through a proxy IP
url = 'http://httpbin.org/get'
proxies = {
    'http': 'http://59.37.33.62:50686'
}
proxy_handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(proxy_handler)
# Setting the User-Agent, method 1: on the opener itself
opener.addheaders = [('User-Agent', '')]  # fill in a real UA string here
# Setting the User-Agent, method 2: build a Request with headers=... and pass
# it to opener.open() - the Boss Zhipin example below does exactly that
response = opener.open(url)
text = response.read().decode('utf-8')
print(text)
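If every request in the script should go through the proxy, install_opener() makes the opener global, so plain urlopen() calls use it too (the throwaway proxy address from above has likely expired by now):

import urllib.request

proxy_handler = urllib.request.ProxyHandler({'http': 'http://59.37.33.62:50686'})
opener = urllib.request.build_opener(proxy_handler)
urllib.request.install_opener(opener)
# From here on, urlopen() routes through the proxy
response = urllib.request.urlopen('http://httpbin.org/get')
print(response.read().decode('utf-8'))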

3. Scraping Boss Zhipin with urllib

"""__author__= 雍新有"""
import urllib.request
import re


def get_html(url):
    # Fetch the page source for the given url with urllib
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
    }
    # Proxy IP
    proxies = {
        'http': 'http://113.120.63.179:9999'
    }
    proxy_handler = urllib.request.ProxyHandler(proxies=proxies)
    opener = urllib.request.build_opener(proxy_handler)

    request = urllib.request.Request(url, headers=headers)
    # response = urllib.request.urlopen(request)
    response = opener.open(request)
    # read() returns the source as bytes; decode it to str
    return response.read().decode('utf-8')


def parse_html(html):
    # Parse the source for the job title, salary, and company.
    # NOTE: the literal HTML fragments inside the original pattern were lost
    # when these notes were exported; the class names below are illustrative
    # placeholders around the three surviving capture groups.
    patterns = re.compile(
        '<div class="job-title">(.*?)</div>'
        '.*?<span class="red">(.*?)</span>'
        '.*?<div class="company-text">.*?<a.*?>(.*?)</a>', re.S)
    result = patterns.findall(html)
    return result


def main():
    # Listing URL for Boss Zhipin jobs in Chengdu
    url = 'https://www.zhipin.com/c101270100/?query=python&page=%s&ka=page-%s'
    for i in range(1, 6):
        # i takes the values 1, 2, 3, 4, 5
        real_url = url % (i, i)
        print(real_url)
        # Fetch the source for this page
        html = get_html(real_url)
        # print(html)
        # Parse the job information out of the source
        result = parse_html(html)
        print(result)


if __name__ == '__main__':
    main()
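The workflow's third stage (storage) is missing from this script; a minimal pymysql sketch for persisting the (title, salary, company) tuples - the spider database and jobs table are assumptions and must exist beforehand:

import pymysql

conn = pymysql.connect(host='localhost', user='root', password='root',
                       database='spider', charset='utf8mb4')
cursor = conn.cursor()
sql = 'INSERT INTO jobs (title, salary, company) VALUES (%s, %s, %s)'
cursor.executemany(sql, result)  # result: the list of 3-tuples from parse_html()
conn.commit()
cursor.close()
conn.close()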

4. Scraping the Maoyan Top 100 with urllib (practice run - results not stored in a database)

"""__author__= 雍新有"""
import urllib.request
import re


def parse_html(html):
    # Mind the whitespace inside the pattern.
    # NOTE: the HTML fragments inside the original pattern were lost when these
    # notes were exported; the pattern below is reconstructed from the field
    # names in the dict (index, img, title, actors, time, score) and the
    # Maoyan Top 100 board markup, so treat it as an approximation.
    patterns = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>'
        r'.*?data-src="(.*?)"'
        r'.*?name"><a.*?>(.*?)</a>'
        r'.*?star">(.*?)</p>'
        r'.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>'
        r'.*?fraction">(.*?)</i>', re.S)
    result = patterns.findall(html)
    # strip() removes the given characters (whitespace and newlines by default)
    # from both ends of a string; it never touches characters in the middle.
    for item in result:
        yield {
            'index': item[0],
            'img': item[1],
            'title': item[2],
            'actors': item[3].strip(),
            'time': item[4],
            'score': item[5] + item[6],
        }


def get_html(url):
    # Fetch the page source for the given url with urllib
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
    }
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    return response.read().decode('utf-8')


def main():
    # Maoyan Top 100 board URL
    url = 'https://maoyan.com/board/4?offset=%s'
    for i in range(0, 10):
        m = i * 10
        real_url = url % m
        print(real_url)
        # Fetch the source for this page
        html = get_html(real_url)
        # print(html)
        # Parse the Maoyan Top 100 entries
        result = parse_html(html)
        for item in result:
            print(item)


if __name__ == '__main__':
    main()
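As the heading says, nothing is written to a database here; since parse_html() already yields dicts, they map straight onto MongoDB documents. A minimal pymongo sketch (local MongoDB assumed; the spider database and maoyan_top100 collection names are made up):

import pymongo

client = pymongo.MongoClient('mongodb://localhost:27017/')
collection = client['spider']['maoyan_top100']
for item in parse_html(html):  # html: page source fetched with get_html()
    collection.insert_one(item)
print(collection.count_documents({}))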
