Traversing a single domain
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

html = urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bsobj = BeautifulSoup(html, "html.parser")
for link in bsobj.find('div', {'id': 'bodyContent'}).findAll('a', href=re.compile("^(/wiki/)((?!:).)*$")):
    if 'href' in link.attrs:
        print(link.attrs['href'])
print(type(bsobj.find('div', {'id': 'bodyContent'})))  # <class 'bs4.element.Tag'>
print(type(bsobj.findAll('a', href=re.compile("^(/wiki/)((?!:).)*$"))))  # <class 'bs4.element.ResultSet'>
print(type(bsobj.find('div', {'id': 'bodyContent'}).findAll('a', href=re.compile("^(/wiki/)((?!:).)*$"))))  # <class 'bs4.element.ResultSet'>
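The regular expression ^(/wiki/)((?!:).)*$ keeps only article links: paths that start with /wiki/ and contain no colon, which filters out namespace pages such as Category: or Talk: links. A quick check of that behaviour (the sample hrefs below are only illustrative, not taken from the page):

import re

pattern = re.compile("^(/wiki/)((?!:).)*$")
samples = [
    "/wiki/Kevin_Bacon",                           # article link -> matched
    "/wiki/Category:Living_people",                # contains ":" -> rejected
    "/w/index.php?title=Kevin_Bacon&action=edit",  # not under /wiki/ -> rejected
]
for href in samples:
    print(href, bool(pattern.match(href)))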
Using a getLinks function to fetch URLs
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

def getLinks(articleUrl):
    # Build the full URL from the relative /wiki/... path that was passed in
    html = urlopen("http://en.wikipedia.org" + articleUrl)
    bsobj = BeautifulSoup(html, "html.parser")
    return bsobj.find('div', {'id': 'bodyContent'}).findAll('a', href=re.compile("^(/wiki/)((?!:).)*$"))

if __name__ == '__main__':
    random.seed(datetime.datetime.now())
    links = getLinks("/wiki/Kevin_Bacon")
    while len(links) > 0:
        # Pick a random article link on the current page and follow it
        newArticle = links[random.randint(0, len(links) - 1)].attrs['href']
        print(newArticle)
        links = getLinks(newArticle)
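Because this random walk never stops on its own (and keeps hitting Wikipedia), it can be useful to cap the number of hops. A minimal sketch of the same loop with an added step limit; maxSteps is an illustrative variable, not part of the original code:

# Same random walk as above, but stopping after a fixed number of hops.
maxSteps = 20  # illustrative limit
links = getLinks("/wiki/Kevin_Bacon")
while len(links) > 0 and maxSteps > 0:
    newArticle = links[random.randint(0, len(links) - 1)].attrs['href']
    print(newArticle)
    links = getLinks(newArticle)
    maxSteps -= 1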
Saving the collected URLs to a database (create the database yourself beforehand)
import mysql.connector  # MySQL driver
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

def insertdatabase(url):
    # Connection settings
    config = {
        'host': '127.0.0.1',
        'user': 'root',
        'password': '1234',
        'port': '3306',
        'database': 'url_save',
        'charset': 'utf8'  # "utf8", not "utf-8" (no hyphen)
    }
    database = mysql.connector.connect(**config)
    cur = database.cursor()  # get a cursor from the connection
    cur.execute("create table if not EXISTS url_table_" + url[2:5] + "(ID int PRIMARY KEY auto_increment,URL VARCHAR(200));")
    # Don't forget the single quotes around the value in VALUES, otherwise the url won't be written
    cur.execute("insert into url_table_" + url[2:5] + "(URL) VALUES('%s')" % url)
    print('Insert succeeded')
    database.commit()  # don't forget to commit
    cur.close()
    database.close()

def getLinks(articleUrl):
    html = urlopen("http://en.wikipedia.org" + articleUrl)
    bsobj = BeautifulSoup(html, "html.parser")
    return bsobj.find('div', {'id': 'bodyContent'}).findAll('a', href=re.compile("^(/wiki/)((?!:).)*$"))

if __name__ == '__main__':
    random.seed(datetime.datetime.now())
    links = getLinks("/wiki/Kevin_Bacon")
    while len(links) > 0:
        newArticle = links[random.randint(0, len(links) - 1)].attrs['href']
        print(newArticle)
        insertdatabase(newArticle)
        links = getLinks(newArticle)
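Building the INSERT statement with % string formatting works here, but it breaks as soon as a URL contains a quote character and is open to SQL injection. A small sketch of the same insert using mysql.connector's parameter placeholders instead, assuming the same cur and url as above:

# Parameterized version of the insert above: the driver escapes the value,
# so quotes inside the URL cannot break the statement.
tableName = "url_table_" + url[2:5]
cur.execute("INSERT INTO " + tableName + " (URL) VALUES (%s)", (url,))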
Deduplicating links
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()

def getLinks(pageUrl):
    global pages
    html = urlopen('http://en.wikipedia.org' + pageUrl)
    bsobj = BeautifulSoup(html, "html.parser")
    try:
        print(bsobj.h1.get_text())
        print(bsobj.find(id='mw-content-text').findAll('p')[0].get_text())
        print(bsobj.find(id='ca-edit').find("span").find('a').attrs['href'])
    except AttributeError:
        print("This page is missing some attributes")
    for link in bsobj.findAll('a', href=re.compile("^(/wiki/)")):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                # We have found a new page
                newPage = link.attrs['href']
                print('**************************************\n' + newPage)
                pages.add(newPage)
                getLinks(newPage)

getLinks("")
Combining the pieces above, we can selectively collect internal or external links. However, after running for a while it started returning 403 errors (see the note after the code).
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
allExtLinks = set()
allIntLinks = set()
random.seed(datetime.datetime.now())

# Get a list of all internal links found on the page
def getInternalLinks(bsobj, includeUrl):
    internalLinks = []
    # Find all links that begin with "/" or that contain the current URL
    for link in bsobj.findAll('a', href=re.compile("^(/|.*" + includeUrl + ")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                # Both relative ("/...") and absolute internal links are kept as-is
                internalLinks.append(link.attrs['href'])
    return internalLinks

# Get a list of all external links found on the page
def getExternalLinks(bsobj, excludeUrl):
    externalLinks = []
    # Find all links that start with "http" or "www" and do not contain the current URL
    for link in bsobj.findAll('a', href=re.compile("^(http|www)((?!" + excludeUrl + ").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

def splitAddress(address):
    addressParts = address.replace("http://", "").split("/")
    return addressParts

def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsobj = BeautifulSoup(html, "html.parser")
    externalLinks = getExternalLinks(bsobj, splitAddress(startingPage)[0])
    if len(externalLinks) == 0:
        # No external links on this page, so follow a random internal link instead
        internalLinks = getInternalLinks(bsobj, startingPage)
        return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks) - 1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks) - 1)]

def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print("Random external link: " + externalLink + "\n")
    followExternalOnly(externalLink)

followExternalOnly("http://oreilly.com")
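The 403 errors mentioned above are most likely sites rejecting the default Python-urllib User-Agent. One common mitigation is to send a browser-like User-Agent header through urllib.request.Request; a minimal sketch (the header string is only an example):

from urllib.request import Request, urlopen

def openWithUserAgent(url):
    # Fetch a page while presenting a browser-like User-Agent,
    # which some sites require before they will serve the page.
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    return urlopen(req)

# Usage: replace urlopen(startingPage) in getRandomExternalLink
# with openWithUserAgent(startingPage).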
As for the later material on Scrapy: since it only supports Python 2.7, I'm setting it aside for now.