Crawler Examples
Example 1: Scraping images
import os
import re
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup

url = "http://www.xiachufang.com"
ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 ' \
     'Safari/537.36 Edge/18.17763'
headers = {'User-Agent': ua}

r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')

# Collect image URLs; lazy-loaded images keep the real URL in data-src.
img_list = []
for img in soup.select('img'):
    if img.has_attr('data-src'):
        img_list.append(img.attrs['data-src'])
    elif img.has_attr('src'):
        img_list.append(img.attrs['src'])

img_dir = os.path.join(os.curdir, 'images')
if not os.path.isdir(img_dir):
    os.mkdir(img_dir)

for img in tqdm(img_list):
    # Keep only absolute URLs that end in a plain image filename.
    li = re.findall(r"^(http\S+/)(\w+\.(png|jpg|jpeg|bmp))", img)
    if li:
        filename = li[0][1]
        filepath = os.path.join(img_dir, filename)
        url_ = '%s%s' % (li[0][0], li[0][1])
        # stream=True so iter_content actually streams the body to disk
        resp = requests.get(url_, headers=headers, stream=True)
        with open(filepath, 'wb') as f:
            for chunk in resp.iter_content(1024):
                f.write(chunk)
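The regular expression above only matches absolute URLs, so relative paths such as /static/logo.png are silently dropped. If those matter, one option is to resolve every src against the page URL first; a minimal sketch using urllib.parse (the helper name normalize_img_url is ours, not part of the original script):

from urllib.parse import urljoin, urlparse
import os.path

def normalize_img_url(page_url, src):
    """Resolve a possibly relative src against the page URL (hypothetical helper).

    Returns the absolute URL, or None if the path does not end in a
    known image extension.
    """
    absolute = urljoin(page_url, src)  # handles 'a.png', '/a.png', '//cdn/a.png'
    ext = os.path.splitext(urlparse(absolute).path)[1].lower()
    return absolute if ext in ('.png', '.jpg', '.jpeg', '.bmp') else None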
Example 2: Scraping links and table data
import requests
from lxml import etree

def not_empty(s):
    return s and s.strip()

def space_filter_and_join(li):
    return ''.join(filter(not_empty, li))

def fetch(url):
    """Fetch and return the page body."""
    r = requests.get(url)
    if r.status_code != 200:
        r.raise_for_status()
    return r.text

def parse_university(url):
    """Parse a university detail page.

    :param url: page URL
    :return: table data as a dict, or None if the table is malformed
    """
    s = etree.HTML(fetch(url))
    data = dict()
    data['name'] = s.xpath('//div[@id="wikiContent"]/h1/text()')[0]
    table = s.xpath('//div[@id="wikiContent"]/div[@class="infobox"]/table')
    if table:
        table = table[0]
        # First column holds the field names, second column the values.
        col1 = table.xpath('.//td[1]')
        col2 = table.xpath('.//td[2]')
        keys, values = [[space_filter_and_join(col.xpath('.//text()')) for col in cols]
                        for cols in (col1, col2)]
        if len(keys) != len(values):
            return None
        data.update(zip(keys, values))
    return data

def process_data(data):
    if data:
        print(data)

if __name__ == '__main__':
    selector = etree.HTML(fetch('http://www.qianmu.org/ranking/1528.htm'))
    links = selector.xpath('//div[@class="rankItem"]//tr[position()>1]/td/a/@href')
    for link in links:
        # Only follow links that stay on the target site.
        if not link.startswith('http://www.qianmu.org'):
            continue
        data = parse_university(link)
        process_data(data)
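process_data only prints each record. To keep the results, one simple option is to append them as JSON lines; a minimal sketch (the filename universities.jsonl is our choice):

import json

def process_data(data, path='universities.jsonl'):
    """Append one JSON record per line (sketch; drop-in for the print version)."""
    if data:
        with open(path, 'a', encoding='utf-8') as f:
            f.write(json.dumps(data, ensure_ascii=False) + '\n')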
Example 3: Multi-threaded data processing
import time
import requests
import threading
from queue import Queue, Empty
from lxml import etree

link_queue = Queue()
threads_num = 10
pages_num = 0
pages_lock = threading.Lock()
threads = []
THREAD_ON = True

def not_empty(s):
    return s and s.strip()

def space_filter_and_join(li):
    return ''.join(filter(not_empty, li))

def fetch(url):
    """Fetch and return the page body."""
    r = requests.get(url)
    if r.status_code != 200:
        r.raise_for_status()
    global pages_num
    with pages_lock:  # += on a global is not atomic across threads
        pages_num += 1
    return r.text

def parse_university(url):
    """Parse a university detail page.

    :param url: page URL
    :return: table data as a dict, or None if the table is malformed
    """
    s = etree.HTML(fetch(url))
    data = dict()
    data['name'] = s.xpath('//div[@id="wikiContent"]/h1/text()')[0]
    table = s.xpath('//div[@id="wikiContent"]/div[@class="infobox"]/table')
    if table:
        table = table[0]
        # First column holds the field names, second column the values.
        col1 = table.xpath('.//td[1]')
        col2 = table.xpath('.//td[2]')
        keys, values = [[space_filter_and_join(col.xpath('.//text()')) for col in cols]
                        for cols in (col1, col2)]
        if len(keys) != len(values):
            return None
        data.update(zip(keys, values))
    return data

def process_data(data):
    if data:
        print(data)

def download():
    # A timed get() keeps workers from blocking forever once the queue
    # drains; a plain get() could hang after THREAD_ON is flipped.
    while THREAD_ON:
        try:
            link = link_queue.get(timeout=1)
        except Empty:
            continue
        data = parse_university(link)
        process_data(data)
        link_queue.task_done()
        print('Remaining queue: %d' % link_queue.qsize())
    print('--> {} exiting'.format(threading.current_thread().name))

if __name__ == '__main__':
    start_time = time.time()
    selector = etree.HTML(fetch('http://www.qianmu.org/ranking/1528.htm'))
    links = selector.xpath('//div[@class="rankItem"]//tr[position()>1]/td/a/@href')
    for link in links:
        if not link.startswith('http://www.qianmu.org'):
            continue
        link_queue.put(link)
    for k in range(threads_num):
        t = threading.Thread(target=download, name='Thread-%d' % k)
        t.start()
        threads.append(t)
    link_queue.join()   # returns once every queued link is task_done()
    THREAD_ON = False   # then tell the workers to exit
    for t in threads:
        t.join()
    cost_time = time.time() - start_time
    print('Downloaded {} pages in {:.2f} seconds'.format(pages_num, cost_time))
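The queue-plus-flag shutdown above is essentially what the standard library's concurrent.futures already packages up. A sketch of the same fan-out with ThreadPoolExecutor, reusing parse_university and process_data from above (per-link error handling omitted):

from concurrent.futures import ThreadPoolExecutor, as_completed

def crawl_with_pool(links, workers=10):
    # The pool owns the thread lifecycle; no manual flag or join dance needed.
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = [pool.submit(parse_university, link) for link in links]
        for future in as_completed(futures):
            process_data(future.result())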
Example 4: Distributed data processing
import sys
import time
import redis
import requests
import threading
from lxml import etree

threads_num = 10
pages_num = 0
pages_lock = threading.Lock()
threads = []
# decode_responses=True makes lpop() return str instead of bytes,
# so queued links can be passed straight to requests.
red = redis.Redis(decode_responses=True)
THREAD_ON = True

def not_empty(s):
    return s and s.strip()

def space_filter_and_join(li):
    return ''.join(filter(not_empty, li))

def fetch(url):
    """Fetch and return the page body."""
    r = requests.get(url)
    if r.status_code != 200:
        r.raise_for_status()
    global pages_num
    with pages_lock:  # += on a global is not atomic across threads
        pages_num += 1
    return r.text

def parse_university(url):
    """Parse a university detail page.

    :param url: page URL
    :return: table data as a dict, or None if the table is malformed
    """
    s = etree.HTML(fetch(url))
    data = dict()
    data['name'] = s.xpath('//div[@id="wikiContent"]/h1/text()')[0]
    table = s.xpath('//div[@id="wikiContent"]/div[@class="infobox"]/table')
    if table:
        table = table[0]
        # First column holds the field names, second column the values.
        col1 = table.xpath('.//td[1]')
        col2 = table.xpath('.//td[2]')
        keys, values = [[space_filter_and_join(col.xpath('.//text()')) for col in cols]
                        for cols in (col1, col2)]
        if len(keys) != len(values):
            return None
        data.update(zip(keys, values))
    return data

def process_data(data):
    if data:
        print(data)

def download():
    while THREAD_ON:
        # lpop is non-blocking: None means the shared queue is empty.
        link = red.lpop('queue')
        if link:
            data = parse_university(link)
            process_data(data)
            print('Remaining queue: %d' % red.llen('queue'))
        else:
            break
    print('--> {} exiting'.format(threading.current_thread().name))

if __name__ == '__main__':
    start_time = time.time()
    if len(sys.argv) > 1:
        # Producer mode: crawl the ranking page and seed the shared queue.
        start_url = sys.argv[1]
        selector = etree.HTML(fetch(start_url))
        links = selector.xpath('//div[@class="rankItem"]//tr[position()>1]/td/a/@href')
        for link in links[:30]:
            if not link.startswith('http://www.qianmu.org'):
                continue
            # sadd returns 1 only for unseen members, so 'seen' deduplicates.
            if red.sadd('seen', link):
                red.rpush('queue', link)
    else:
        # Consumer mode: start worker threads that drain the queue.
        for k in range(threads_num):
            t = threading.Thread(target=download, name='Thread-%d' % k)
            t.start()
            threads.append(t)
        for t in threads:
            t.join()
        red.delete('queue')
        red.delete('seen')
    cost_time = time.time() - start_time
    print('Downloaded {} pages in {:.2f} seconds'.format(pages_num, cost_time))
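A possible run order (a sketch; assumes a Redis server on localhost and the script saved as crawler.py, a filename we chose):

python crawler.py http://www.qianmu.org/ranking/1528.htm   # producer: seed the queue
python crawler.py                                           # consumer: drain the queue

Any number of consumer processes can drain the queue concurrently; to run them on other machines, redis.Redis() would need the shared server's host instead of the localhost default. That shared queue is what makes this version distributed.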