[Study Notes] Web Scraping with Python: Crawling Across the Internet

Collecting data from an entire website

Import the required modules

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import ssl
import datetime
import random

Writing code is fun enough on its own, so rather than properly fixing PyCharm's certificate problem, just skip SSL certificate verification:

ssl._create_default_https_context = ssl._create_unverified_context 
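This line turns off certificate verification for every HTTPS request in the process. A more contained alternative (just a sketch, not part of the original notes) is to build an unverified SSLContext and pass it to urlopen per request:

import ssl
from urllib.request import urlopen

# Context that skips hostname checks and certificate verification
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

html = urlopen('https://baike.baidu.com', context=ctx)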

Seed the random number generator with the current time

random.seed(datetime.datetime.now().timestamp())

Get a list of all internal links on a page

def get_internal_links(bsObj, internal_url):
	internal_links = []
	# Find links that begin with '/' or contain the current site's URL
	for link in bsObj.findAll('a', href=re.compile("^(/|.*" + internal_url + ")")):
		# If the link has an href value
		if link.attrs['href'] is not None:
			# If it is a link we have not seen before
			if link.attrs['href'] not in internal_links:
				internal_links.append(link.attrs['href'])
	return internal_links

Get a list of all external links on a page

def get_external_links(bsObj, external_url):
	external_links = []
	# Find links that start with 'http' or 'www' and do not contain the current URL
	for link in bsObj.findAll('a', href=re.compile("^(http|www)((?!" + external_url + ").)*$")):
		if link.attrs['href'] is not None:
			if link.attrs['href'] not in external_links:
				external_links.append(link.attrs['href'])
	return external_links
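As a quick offline check (a sketch using a made-up HTML snippet and the hypothetical domain example.com, not from the original notes; it assumes the imports and the two functions above), the two functions classify links like this:

sample_html = '''
<a href="/item/Python">internal, relative</a>
<a href="http://example.com/about">internal, absolute</a>
<a href="http://other-site.org/page">external</a>
'''
bs = BeautifulSoup(sample_html, 'html.parser')
print(get_internal_links(bs, 'example.com'))  # ['/item/Python', 'http://example.com/about']
print(get_external_links(bs, 'example.com'))  # ['http://other-site.org/page']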

Split the address

# Split the address so later functions can extract the current page's domain
def split_address(address):
	address_parts = address.replace('http://', '').split('/')
	return address_parts
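For example (a sketch), the domain ends up as the first element of the returned list:

print(split_address('http://baike.baidu.com/item/Python'))
# ['baike.baidu.com', 'item', 'Python']

Note that only 'http://' is stripped, so for an https URL the first element would be 'https:' rather than the domain.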

Get the next random external link

def get_random_external_link(starting_page):
	html = urlopen(starting_page)
	bsObj = BeautifulSoup(html, 'html.parser')
	# Get the list of external links that do not contain the current page's URL
	external_links = get_external_links(bsObj, split_address(starting_page)[0])
	# If no such external link was found on the current page
	if len(external_links) == 0:
		# Fall back to a random internal link from the current page
		internal_links = get_internal_links(bsObj, starting_page)
		return internal_links[random.randint(0, len(internal_links)-1)]
	else:
		return external_links[random.randint(0, len(external_links)-1)]
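One edge case: if a page has neither qualifying external links nor internal links, random.randint is called on an empty range and raises ValueError. A guarded variant (a sketch with the hypothetical name get_random_external_link_safe, not in the original notes):

def get_random_external_link_safe(starting_page):
	html = urlopen(starting_page)
	bsObj = BeautifulSoup(html, 'html.parser')
	external_links = get_external_links(bsObj, split_address(starting_page)[0])
	if external_links:
		return random.choice(external_links)
	internal_links = get_internal_links(bsObj, starting_page)
	if internal_links:
		return random.choice(internal_links)
	# Dead end: the page has no usable links at all
	return None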

Follow random external links only

def follow_external_link_only(starting_site):
	# Use the starting_site passed in by the caller
	external_link_only = get_random_external_link(starting_site)
	print('Found external link:', external_link_only)
	# Recurse to follow the next external link
	follow_external_link_only(external_link_only)

Call the function, replacing the placeholder string with the actual starting URL:

follow_external_link_only('starting_site')
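As written, follow_external_link_only recurses indefinitely and will eventually hit Python's recursion limit (about 1000 frames by default). A bounded, iterative variant (a sketch; the name follow_external_links and the max_pages parameter are hypothetical, not from the original notes):

def follow_external_links(starting_site, max_pages=20):
	visited = set()
	current = starting_site
	while current is not None and len(visited) < max_pages:
		print('Found external link:', current)
		visited.add(current)
		current = get_random_external_link(current)
		if current in visited:
			# Stop rather than bounce between pages we have already seen
			break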

Take Baidu Baike (http://baike.baidu.com) as an example.

The complete code is as follows:

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import ssl
import datetime
import random

ssl._create_default_https_context = ssl._create_unverified_context
random.seed(datetime.datetime.now().timestamp())


def get_internal_links(bsObj, internal_url):
    internal_links = []
    # Find links that begin with '/' or contain the current site's URL
    for link in bsObj.findAll('a', href=re.compile("^(/|.*" + internal_url + ")")):
        # If the link has an href value
        if link.attrs['href'] is not None:
            # If it is a link we have not seen before
            if link.attrs['href'] not in internal_links:
                internal_links.append(link.attrs['href'])
    return internal_links


def get_external_links(bsObj, external_url):
    external_links = []
    # Find links that start with 'http' or 'www' and do not contain the current URL
    for link in bsObj.findAll('a', href=re.compile("^(http|www)((?!" + external_url + ").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in external_links:
                external_links.append(link.attrs['href'])
    return external_links


# Split the address so later functions can extract the current page's domain
def split_address(address):
    address_parts = address.replace('http://', '').split('/')
    return address_parts


def get_random_external_link(starting_page):
    html = urlopen(starting_page)
    bsObj = BeautifulSoup(html, 'html.parser')
    # Get the list of external links that do not contain the current page's URL
    external_links = get_external_links(bsObj, split_address(starting_page)[0])
    # If no such external link was found on the current page
    if len(external_links) == 0:
        # Fall back to a random internal link from the current page
        internal_links = get_internal_links(bsObj, starting_page)
        return internal_links[random.randint(0, len(internal_links)-1)]
    else:
        return external_links[random.randint(0, len(external_links)-1)]


def follow_external_link_only(starting_site):
    # Use the starting_site passed in by the caller, not a hard-coded URL
    external_link_only = get_random_external_link(starting_site)
    print('Found external link:', external_link_only)
    # Recurse to follow the next external link
    follow_external_link_only(external_link_only)


follow_external_link_only('http://baike.baidu.com')
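In a real run the script usually stops with an HTTPError or URLError once it follows a dead or blocked link. A hedged sketch of the same follow function with that failure handled (not part of the original code):

from urllib.error import HTTPError, URLError

def follow_external_link_only(starting_site):
    try:
        external_link_only = get_random_external_link(starting_site)
    except (HTTPError, URLError) as err:
        print('Stopping: could not fetch', starting_site, '-', err)
        return
    print('Found external link:', external_link_only)
    follow_external_link_only(external_link_only)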

Running the original script produces output like the following:
