These are my learning notes on scraping static pages.

#suppress warnings (e.g. BeautifulSoup's "no parser was explicitly specified" notice)
import warnings
warnings.filterwarnings("ignore")

Your First Web Scraper

Creating a BeautifulSoup object

from urllib.request import urlopen
from bs4 import BeautifulSoup

html=urlopen("http://pythonscraping.com/pages/page1.html")
bs0bj=BeautifulSoup(html.read())

print(bs0bj)
print(bs0bj.h1)
#print(bs0bj.h1) outputs the <h1> tag and its contents

Advanced HTML Parsing

findAll()

from urllib.request import urlopen
from bs4 import BeautifulSoup

html=urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bs0bj=BeautifulSoup(html)
#create a BeautifulSoup object

namelist=bs0bj.findAll("span",{"class":"green"})
for name in namelist:
    print(name.get_text())
#pull out the span tags; .get_text() strips out every tag and returns a string containing only the text

#namelist1=bs0bj.findAll(text="the price")
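
To make the effect of .get_text() concrete, here is a minimal, self-contained sketch using a made-up HTML snippet instead of the live page:

from bs4 import BeautifulSoup

snippet='<span class="green">Anna <b>Pavlovna</b></span>'   #hypothetical markup for illustration
span=BeautifulSoup(snippet,"html.parser").span

print(str(span))        #the tag with all of its markup kept
print(span.get_text())  #"Anna Pavlovna": every tag stripped, only the text remains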

BeautifulSoup's find() and findAll()

from urllib.request import urlopen
from bs4 import BeautifulSoup 

html=urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bs0bj=BeautifulSoup(html)

allText=bs0bj.findAll("",{"id":"text"})
for i in allText:
    print(i.get_text())

#findAll(tag, attributes, recursive, text, limit, keywords)
#find(tag, attributes, recursive, text, keywords)  #find is just findAll with limit=1
#The three code lines above are equivalent to using a keyword argument:
#allText=bs0bj.findAll(id="text")
#print(allText[0].get_text())
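
The limit and keyword parameters named in those signatures can be sketched on the same page (the exact counts depend on the page's current content):

from urllib.request import urlopen
from bs4 import BeautifulSoup

html=urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bs0bj=BeautifulSoup(html,"html.parser")

#limit caps how many matches findAll returns
firstTwo=bs0bj.findAll("span",{"class":"green"},limit=2)
print(len(firstTwo))

#keyword form; "class" is reserved in Python, so BeautifulSoup accepts class_ instead
print(len(bs0bj.findAll(class_="green")))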

Dealing with child tags (.children)

#BeautifulSoup functions always operate on the descendants of the current tag
from urllib.request import urlopen
from bs4 import BeautifulSoup 

html=urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj= BeautifulSoup(html)

for child in bs0bj.find("table",{"id":"giftList"}).children:
    print(child)
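
For contrast, a short sketch of .children versus .descendants on the same table: .children yields only the direct children, while .descendants recurses through every nested tag and text node.

from urllib.request import urlopen
from bs4 import BeautifulSoup

html=urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj=BeautifulSoup(html,"html.parser")

table=bs0bj.find("table",{"id":"giftList"})
print(len(list(table.children)))     #direct children only
print(len(list(table.descendants)))  #every nested tag and string, a much larger number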

Dealing with sibling tags (.next_siblings)

from urllib.request import urlopen
from bs4 import BeautifulSoup 

html=urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj= BeautifulSoup(html)

for sibling in bs0bj.find("table",{"id":"giftList"}).tr.next_siblings:
    print(sibling)

#next_sibling and previous_sibling work the same way, but return a single tag instead of a group of tags
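
A tiny local example of the single-tag versions, using a hand-written row so there is no whitespace between the tags:

from bs4 import BeautifulSoup

row=BeautifulSoup("<tr><td>a</td><td>b</td><td>c</td></tr>","html.parser").tr

print(row.td.next_sibling)                   #<td>b</td>, one tag rather than a list
print(row.td.next_sibling.previous_sibling)  #back to <td>a</td>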

Dealing with parent tags (.parent)

from urllib.request import urlopen
from bs4 import BeautifulSoup 

html=urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj= BeautifulSoup(html)

print(bs0bj.find("img",{"src":"../img/gifts/img1.jpg"
                       }).parent.previous_sibling.get_text())
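
The chained navigation above is easier to follow on a stripped-down, hypothetical row that mimics the gift table's layout (a price cell followed by an image cell):

from bs4 import BeautifulSoup

snippet="<tr><td>$15.00</td><td><img src='../img/gifts/img1.jpg'/></td></tr>"
img=BeautifulSoup(snippet,"html.parser").find("img")

print(img.parent)                              #the <td> that wraps the image
print(img.parent.previous_sibling.get_text())  #$15.00, the cell just before it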

Regular Expressions and BeautifulSoup

from urllib.request import urlopen
from bs4 import BeautifulSoup 
import re

html=urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj= BeautifulSoup(html)

images=bs0bj.findAll("img",{"src":re.compile(r"\.\./img/gifts/img.*\.jpg")})
for image in images:
    print(image["src"])
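
The pattern itself can be sanity-checked in isolation; note that "/" needs no escaping inside a regular expression, and a raw string keeps the backslashes literal:

import re

pattern=re.compile(r"\.\./img/gifts/img.*\.jpg")
print(bool(pattern.match("../img/gifts/img1.jpg")))  #True
print(bool(pattern.match("../img/gifts/logo.png")))  #False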

Lambda Expressions

#soup.findAll(lambda tag:len(tag.attrs)==2)

#In the expression above:
#myTag.attrs returns all of a tag's attributes as a dict
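
A runnable sketch of the lambda form against the page3.html page used above; the lambda receives each Tag and must return True or False:

from urllib.request import urlopen
from bs4 import BeautifulSoup

html=urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj=BeautifulSoup(html,"html.parser")

#tag.attrs is a dict of the tag's attributes, so this keeps tags with exactly two of them
for tag in bs0bj.findAll(lambda tag:len(tag.attrs)==2):
    print(tag.name,tag.attrs)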

Starting to Crawl

Traversing a Single Domain

from urllib.request import urlopen
from bs4 import BeautifulSoup

html=urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bs0bj=BeautifulSoup(html)

for line in bs0bj.findAll("a"):
    if "href" in line.attrs:
        print(line.attrs["href"])
#only follow links that point to article pages (not the sidebar, header, or footer)

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

html=urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bs0bj=BeautifulSoup(html)

for line in bs0bj.find("div",{"id":"bodyContent"}).findAll("a",{"href":re.compile(r"^/wiki/((?!:).)*$")}):
    if "href" in line.attrs:
        print(line.attrs["href"])

#Here the outer call must be find and the inner one findAll: find returns the single
#bodyContent div as one Tag, while findAll on that Tag returns a list of the matching
#<a> tags, which is what the loop and line.attrs need, so it is findAll that produces the output.
#Next: seed the random number generator and wrap the link extraction in a function,
#so we can keep pulling links from each new page.

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random
import datetime

random.seed(datetime.datetime.now())
def getlines(articleUrl):
    html=urlopen("http://en.wikipedia.org"+articleUrl)
    bs0bj=BeautifulSoup(html)
    return bs0bj.find("div",{"id":"bodyContent"}).findAll("a",{"href":re.compile(r"^/wiki/((?!:).)*$")})

links=getlines("/wiki/Kevin_Bacon")
print(links)
while len(links)>0:
    newArticle=links[random.randint(0,len(links)-1)].attrs["href"]
    print(newArticle)
    links=getlines(newArticle)

Crawling an Entire Site

#to avoid collecting the same page twice, the links must be de-duplicated

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages=set()
def getlinks(pageUrl):
    global pages
    html=urlopen("http://en.wikipedia.org"+ pageUrl)
    bs0bj=BeautifulSoup(html)
    for link in bs0bj.findAll("a",{"href":re.compile(r"^(/wiki/)")}):
        if "href" in link.attrs:
            if link.attrs["href"] not in pages:
                newpage=link.attrs["href"]
                print(newpage)
                pages.add(newpage)
                getlinks(newpage)
getlinks(" ")

#global lets a function assign to a variable that was defined outside of it
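
A minimal illustration of the global statement with hypothetical names; without it, the assignment inside the function would raise UnboundLocalError:

pageCount=0             #defined at module level

def visit():
    global pageCount    #rebind the module-level name instead of creating a local one
    pageCount+=1

visit()
visit()
print(pageCount)        #2
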
#Next: collect data from across the entire site

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages=set()
def getlinks(pageUrl):
    global pages
    html=urlopen("http://en.wikipedia.org"+ pageUrl)
    bs0bj=BeautifulSoup(html)
    try:
        print(bs0bj.h1.get_text())
        print(bs0bj.find(id="mw-content-text").findAll("p")[0])
        print(bs0bj.find(id="ca-edit").find("span").find("a").attrs["href"])
    except AttributeError:
        print("页面缺少一些属性!不过不用担心")
    for link in bs0bj.findAll("a",{"href":re.compile(r"^(/wiki/)")}):
        if "href" in link.attrs:
            if link.attrs["href"] not in pages:
                newpage=link.attrs["href"]
                print(newpage)
                pages.add(newpage)
                getlinks(newpage)
getlinks(" ")

#The print statements are ordered by how likely their data is to appear on the page
#Next: get a random external link; if a page has none, look elsewhere on the same site

from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
random.seed(datetime.datetime.now())

#Retrieves a list of all Internal links found on a page
def getInternalLinks(bsObj, includeUrl):
    includeUrl = urlparse(includeUrl).scheme+"://"+urlparse(includeUrl).netloc
    internalLinks = []
    #Finds all links that begin with a "/"
    for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if(link.attrs['href'].startswith("/")):
                    internalLinks.append(includeUrl+link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks

#Retrieves a list of all external links found on a page
def getExternalLinks(bsObj, excludeUrl):
    externalLinks = []
    #Finds all links that start with "http" or "www" that do
    #not contain the current URL
    for link in bsObj.findAll("a", href=re.compile(
                                "^(http|www)((?!"+excludeUrl+").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html,"html.parser")
    externalLinks = getExternalLinks(bsObj, urlparse(startingPage).netloc)
    if len(externalLinks) == 0:
        print("No external links, looking around the site for one")
        domain = urlparse(startingPage).scheme+"://"+urlparse(startingPage).netloc
        internalLinks = getInternalLinks(bsObj, domain)
        return getRandomExternalLink(internalLinks[random.randint(0,len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]

def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print("Random external link is: "+externalLink)
    followExternalOnly(externalLink)

followExternalOnly("http://oreilly.com")

#bsObj = BeautifulSoup(html,"html.parser")
#the "html.parser" argument can be omitted; BeautifulSoup will then pick a parser on its own and issue a warning


#Collect every external link on a site
allExtLinks = set()
allIntLinks = set()
def getAllExternalLinks(siteUrl):
    html = urlopen(siteUrl)
    domain = urlparse(siteUrl).scheme+"://"+urlparse(siteUrl).netloc
    bsObj = BeautifulSoup(html, "html.parser")
    internalLinks = getInternalLinks(bsObj,domain)
    externalLinks = getExternalLinks(bsObj,domain)

    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)
    for link in internalLinks:
        if link not in allIntLinks:
            allIntLinks.add(link)
            getAllExternalLinks(link)

allIntLinks.add("http://oreilly.com")
getAllExternalLinks("http://oreilly.com")
#Crawling with Scrapy... it is very powerful, but I am setting it aside for now

from scrapy.selector import Selector
from scrapy import Spider
from typapa.typapa.items import Article

class ArticleSpider(Spider):
    name="article"
    allowed_domains=["en.wikipedia.org"]
    start_urls=["http://en.wikipedia.org/wiki/Main_Page",
                "http://en.wikipedia.org/wiki/Python_%28programming_language%29"]

    def parse(self,response):
        item=Article()
        title=response.xpath("//h1/text()")[0].extract()
        print("Title is: "+title)
        item["title"]=title
        return item
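
The Article item imported above is not shown in these notes; a minimal items.py for the typapa project (my guess, following the standard Scrapy pattern) would look something like this:

#typapa/items.py, assumed contents, not part of the original notes
import scrapy

class Article(scrapy.Item):
    #one Field per piece of data the spider stores
    title=scrapy.Field()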

Parsing JSON Data

#look up which country an IP address belongs to

import json 
from urllib.request import urlopen

def getCountry(ipAddress):
    response=urlopen("http://freegeoip.net/json/"+ipAddress).read().decode("utf-8")
    responseJson=json.loads(response)
    return responseJson.get("country_code")
print(getCountry("50.78.253.58"))

import json
jsonString='{"array0fNums":[{"number":0},{"number":1},{"number":2}],"array0fFruits":[{"fruit":"apple"},{"fruit":"banana"},{"fruit":"pear"}]}'
json0bj=json.loads(jsonString)

print(json0bj.get("array0fNums"))
print(json0bj.get("array0fNums")[1])
print(json0bj.get("array0fNums")[1].get("number")+
     json0bj.get("array0fNums")[2].get("number"))
print(json0bj.get("array0fFruits")[2].get("fruit"))

#getLinks parses a page and finds the article URLs we need
#getHistoryIPs pulls the anonymous editors' IP addresses out of a page's revision history
#getCountry resolves each IP address to the country it belongs to

from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import json
import re
import datetime
import random

random.seed(datetime.datetime.now())
def getLinks(articleUrl):
    html=urlopen("http://en.wikipedia.org"+articleUrl)
    bs0bj=BeautifulSoup(html)
    return bs0bj.find("div",{"id":"bodyContent"}).findAll("a",{"href":re.compile(r"^(/wiki/)((?!:).)*$")})

def getHistoryIPs(pageUrl):
    pageUrl=pageUrl.replace("/wiki/","")
    historyUrl = "http://en.wikipedia.org/w/index.php?title="+pageUrl+"&action=history"
    print("history url is: "+historyUrl)
    html=urlopen(historyUrl)
    bs0bj=BeautifulSoup(html)
    Addresses=bs0bj.findAll("a",{"class":"mw-anonuserlink"})
    addressList=set()
    for Address in Addresses:
        addressList.add(Address.get_text())
    return addressList

def getCountry(ipAddress):
    try:
        response = urlopen("http://freegeoip.net/json/"+ipAddress).read().decode('utf-8')
    except HTTPError:
        return None
    responseJson = json.loads(response)
    return responseJson["region_name"]

links = getLinks("/wiki/Python_(programming_language)")


while(len(links) > 0):
    for link in links:
        print("-------------------") 
        historyIPs = getHistoryIPs(link.attrs["href"])
        for historyIP in historyIPs:
            country = getCountry(historyIP)
            if country is not None:
                print(historyIP+" is from "+country)
    newLink = links[random.randint(0, len(links)-1)].attrs["href"]
    links = getLinks(newLink)

Storing Data

Media Files

from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup

html=urlopen("http://www.pythonscraping.com/")
bs0bj=BeautifulSoup(html)
imageLocation=bs0bj.find("a",{"id":"logo"}).find("img").attrs["src"]
urlretrieve(imageLocation,"logo.jpg")
#Haha, also tried downloading an image from Youxiake

from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup

html=urlopen("http://www.youxiake.com/")
bs0bj=BeautifulSoup(html)
imageLocation=bs0bj.find("a",{"href":"http://www.youxiake.com/hotel/yunhe"}).find("img").attrs["src"]
print(imageLocation)
urlretrieve(imageLocation,"logo1.jpg")

Storing Data in a CSV File

import csv

csvFile=open("C:/Users/dell-pc/Documents/Python Scripts/testa.csv","wt", newline ="")
try:
    writer=csv.writer(csvFile)
    writer.writerow(("number","number plus 2","number times 2"))
    for i in range(10):
        writer.writerow((i,i+2,i*2))
finally:
    csvFile.close()

#newline="" prevents the blank rows that the csv writer otherwise inserts between records on Windows
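
A quick way to check the file written above is to read it straight back with csv.reader (same path as before):

import csv

with open("C:/Users/dell-pc/Documents/Python Scripts/testa.csv","rt",newline="") as f:
    for row in csv.reader(f):
        print(row)   #each row comes back as a list of strings
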
#Next: fetch an HTML table and write it to a CSV file

from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv

html=urlopen("https://en.wikipedia.org/wiki/Comparison_of_text_editors")
bj0bs=BeautifulSoup(html)
table=bj0bs.findAll("table",{"class":"wikitable"})[0]
rows=table.findAll("tr")

csvFile=open("C:/Users/dell-pc/Documents/Python Scripts/test1.csv","wt",newline="",encoding="utf-8")
writer=csv.writer(csvFile)
#the two lines above (opening the file and creating the writer) are required setup before writing rows

try:
    for row in rows:
        csvRow=[]
        for cell in row.findAll(["td","th"]):
            csvRow.append(cell.get_text())
        writer.writerow(csvRow)
        #the line above writes a single row
finally:
    csvFile.close()
