import warnings
warnings.filterwarnings("ignore")
Your First Web Scraper
Creating a BeautifulSoup object
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://pythonscraping.com/pages/page1.html")
bs0bj = BeautifulSoup(html.read(), "html.parser")
print(bs0bj)
print(bs0bj.h1)
Advanced HTML Parsing
findAll()
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bs0bj = BeautifulSoup(html, "html.parser")
namelist = bs0bj.findAll("span", {"class": "green"})
for name in namelist:
    print(name.get_text())
BeautifulSoup's find() and findAll()
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bs0bj = BeautifulSoup(html, "html.parser")
allText = bs0bj.findAll("", {"id": "text"})
for i in allText:
    print(i.get_text())
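The heading above also mentions find(); as a quick sketch reusing the bs0bj object from this snippet, find() returns only the first matching tag rather than a list (it behaves like findAll with limit=1):
# find() returns the first matching tag instead of a list of results
firstGreen = bs0bj.find("span", {"class": "green"})
print(firstGreen.get_text())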
Dealing with child tags (.children)
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj = BeautifulSoup(html, "html.parser")
for child in bs0bj.find("table", {"id": "giftList"}).children:
    print(child)
Dealing with sibling tags (.next_siblings)
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj = BeautifulSoup(html, "html.parser")
for sibling in bs0bj.find("table", {"id": "giftList"}).tr.next_siblings:
    print(sibling)
Dealing with parent tags (.parent)
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj = BeautifulSoup(html, "html.parser")
print(bs0bj.find("img", {"src": "../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())
Regular expressions and BeautifulSoup
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj = BeautifulSoup(html, "html.parser")
images = bs0bj.findAll("img", {"src": re.compile(r"\.\./img/gifts/img.*\.jpg")})
for image in images:
    print(image["src"])
Lambda expressions
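No code accompanied this note; as a minimal sketch (reusing the page3.html product page from the examples above), findAll also accepts a lambda that receives each tag and returns True or False, here selecting every tag that carries exactly two attributes:
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj = BeautifulSoup(html, "html.parser")
# Keep only tags with exactly two attributes
for tag in bs0bj.findAll(lambda tag: len(tag.attrs) == 2):
    print(tag.name, tag.attrs)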
Starting to Crawl
Traversing a single domain
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bs0bj = BeautifulSoup(html, "html.parser")
for line in bs0bj.findAll("a"):
    if "href" in line.attrs:
        print(line.attrs["href"])
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
html = urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bs0bj = BeautifulSoup(html, "html.parser")
for line in bs0bj.find("div", {"id": "bodyContent"}).findAll("a", {"href": re.compile("^/wiki/((?!:).)*$")}):
    if "href" in line.attrs:
        print(line.attrs["href"])
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random
import datetime
random.seed(datetime.datetime.now())
def getlines(articleUrl):
    html = urlopen("http://en.wikipedia.org" + articleUrl)
    bs0bj = BeautifulSoup(html, "html.parser")
    return bs0bj.find("div", {"id": "bodyContent"}).findAll("a", {"href": re.compile("^/wiki/((?!:).)*$")})

links = getlines("/wiki/Kevin_Bacon")
print(links)
while len(links) > 0:
    newArticle = links[random.randint(0, len(links)-1)].attrs["href"]
    print(newArticle)
    links = getlines(newArticle)
Crawling an entire site
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
pages=set()
def getlinks(pageUrl):
    global pages
    html = urlopen("http://en.wikipedia.org" + pageUrl)
    bs0bj = BeautifulSoup(html, "html.parser")
    for link in bs0bj.findAll("a", {"href": re.compile("^(/wiki/)")}):
        if "href" in link.attrs:
            if link.attrs["href"] not in pages:
                # We have found a new page
                newpage = link.attrs["href"]
                print(newpage)
                pages.add(newpage)
                getlinks(newpage)

getlinks("")
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
pages=set()
def getlinks(pageUrl):
    global pages
    html = urlopen("http://en.wikipedia.org" + pageUrl)
    bs0bj = BeautifulSoup(html, "html.parser")
    try:
        print(bs0bj.h1.get_text())
        print(bs0bj.find(id="mw-content-text").findAll("p")[0])
        print(bs0bj.find(id="ca-edit").find("span").find("a").attrs["href"])
    except AttributeError:
        print("This page is missing something! No worries though!")
    for link in bs0bj.findAll("a", {"href": re.compile("^(/wiki/)")}):
        if "href" in link.attrs:
            if link.attrs["href"] not in pages:
                newpage = link.attrs["href"]
                print(newpage)
                pages.add(newpage)
                getlinks(newpage)

getlinks("")
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random
pages = set()
random.seed(datetime.datetime.now())
def getInternalLinks(bsObj, includeUrl):
    includeUrl = urlparse(includeUrl).scheme + "://" + urlparse(includeUrl).netloc
    internalLinks = []
    # Finds all links that begin with a "/" or contain the current URL
    for link in bsObj.findAll("a", href=re.compile("^(/|.*" + includeUrl + ")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if link.attrs['href'].startswith("/"):
                    internalLinks.append(includeUrl + link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks

def getExternalLinks(bsObj, excludeUrl):
    externalLinks = []
    # Finds all links that start with "http" or "www" and do not contain the current URL
    for link in bsObj.findAll("a", href=re.compile("^(http|www)((?!" + excludeUrl + ").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html, "html.parser")
    externalLinks = getExternalLinks(bsObj, urlparse(startingPage).netloc)
    if len(externalLinks) == 0:
        print("No external links, looking around the site for one")
        domain = urlparse(startingPage).scheme + "://" + urlparse(startingPage).netloc
        internalLinks = getInternalLinks(bsObj, domain)
        return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]

def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print("Random external link is: " + externalLink)
    followExternalOnly(externalLink)

followExternalOnly("http://oreilly.com")
allExtLinks = set()
allIntLinks = set()
def getAllExternalLinks(siteUrl):
    html = urlopen(siteUrl)
    domain = urlparse(siteUrl).scheme + "://" + urlparse(siteUrl).netloc
    bsObj = BeautifulSoup(html, "html.parser")
    internalLinks = getInternalLinks(bsObj, domain)
    externalLinks = getExternalLinks(bsObj, domain)
    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)
    for link in internalLinks:
        if link not in allIntLinks:
            allIntLinks.add(link)
            getAllExternalLinks(link)
allIntLinks.add("http://oreilly.com")
getAllExternalLinks("http://oreilly.com")
from scrapy.selector import Selector
from scrapy import Spider
from typapa.typapa.items import Article
class ArticleSpider(Spider):
    name = "article"
    allowed_domains = ["en.wikipedia.org"]
    start_urls = ["http://en.wikipedia.org/wiki/Main_Page",
                  "http://en.wikipedia.org/wiki/Python_%28programming_language%29"]

    def parse(self, response):
        item = Article()
        title = response.xpath("//h1/text()")[0].extract()
        print("Title is: " + title)
        item["title"] = title
        return item
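The spider imports Article from the project's items module, which is not shown in these notes; a minimal sketch of what that items.py might contain (assuming a Scrapy project named typapa, matching the import above):
# items.py -- hypothetical sketch; only the field used by ArticleSpider is defined
from scrapy import Item, Field

class Article(Item):
    title = Field()
The spider would then typically be run from the project directory with scrapy crawl article.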
Parsing JSON data
import json
from urllib.request import urlopen
def getCountry(ipAddress):
    response = urlopen("http://freegeoip.net/json/" + ipAddress).read().decode("utf-8")
    responseJson = json.loads(response)
    return responseJson.get("country_code")

print(getCountry("50.78.253.58"))
import json
jsonString='{"array0fNums":[{"number":0},{"number":1},{"number":2}],"array0fFruits":[{"fruit":"apple"},{"fruit":"banana"},{"fruit":"pear"}]}'
json0bj=json.loads(jsonString)
print(json0bj.get("array0fNums"))
print(json0bj.get("array0fNums")[1])
print(json0bj.get("array0fNums")[1].get("number")+
json0bj.get("array0fNums")[2].get("number"))
print(json0bj.get("array0fFruits")[2].get("fruit"))
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import json
import re
import datetime
import random
random.seed(datetime.datetime.now())
def getLinks(articleUrl):
    html = urlopen("http://en.wikipedia.org" + articleUrl)
    bs0bj = BeautifulSoup(html, "html.parser")
    return bs0bj.find("div", {"id": "bodyContent"}).findAll("a", {"href": re.compile("^(/wiki/)((?!:).)*$")})
def getHistoryIPs(pageUrl):
    # Revision-history pages follow the pattern:
    # http://en.wikipedia.org/w/index.php?title=Page_title&action=history
    pageUrl = pageUrl.replace("/wiki/", "")
    historyUrl = "http://en.wikipedia.org/w/index.php?title=" + pageUrl + "&action=history"
    print("history url is: " + historyUrl)
    html = urlopen(historyUrl)
    bs0bj = BeautifulSoup(html, "html.parser")
    # Anonymous edits are attributed to an IP address rather than a username
    Addresses = bs0bj.findAll("a", {"class": "mw-anonuserlink"})
    addressList = set()
    for Address in Addresses:
        addressList.add(Address.get_text())
    return addressList
def getCountry(ipAddress):
    try:
        response = urlopen("http://freegeoip.net/json/" + ipAddress).read().decode('utf-8')
    except HTTPError:
        return None
    responseJson = json.loads(response)
    return responseJson["region_name"]
links = getLinks("/wiki/Python_(programming_language)")
while len(links) > 0:
    for link in links:
        print("-------------------")
        historyIPs = getHistoryIPs(link.attrs["href"])
        for historyIP in historyIPs:
            country = getCountry(historyIP)
            if country is not None:
                print(historyIP + " is from " + country)
    newLink = links[random.randint(0, len(links)-1)].attrs["href"]
    links = getLinks(newLink)
Storing Data
Media files
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/")
bs0bj = BeautifulSoup(html, "html.parser")
imageLocation = bs0bj.find("a", {"id": "logo"}).find("img").attrs["src"]
urlretrieve(imageLocation, "logo.jpg")
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.youxiake.com/")
bs0bj = BeautifulSoup(html, "html.parser")
imageLocation = bs0bj.find("a", {"href": "http://www.youxiake.com/hotel/yunhe"}).find("img").attrs["src"]
print(imageLocation)
urlretrieve(imageLocation, "logo1.jpg")
Storing data to CSV
import csv
csvFile=open("C:/Users/dell-pc/Documents/Python Scripts/testa.csv","wt", newline ="")
try:
    writer = csv.writer(csvFile)
    writer.writerow(("number", "number plus 2", "number times 2"))
    for i in range(10):
        writer.writerow((i, i + 2, i * 2))
finally:
    csvFile.close()
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv
html = urlopen("https://en.wikipedia.org/wiki/Comparison_of_text_editors")
bj0bs = BeautifulSoup(html, "html.parser")
table = bj0bs.findAll("table", {"class": "wikitable"})[0]
rows = table.findAll("tr")
csvFile = open("C:/Users/dell-pc/Documents/Python Scripts/test1.csv", "wt", newline="", encoding="utf-8")
writer = csv.writer(csvFile)
try:
    for row in rows:
        csvRow = []
        for cell in row.findAll(["td", "th"]):
            csvRow.append(cell.get_text())
        writer.writerow(csvRow)
finally:
    csvFile.close()