Holiday Recharge--Web Scraping, Day 1

Day 1 of web scraping starts with internal and external links. Below is a simple program that collects a page's internal and external links. What puzzles me at this stage is that the program seems to treat each external link it lands on as the next starting page and keeps searching link by link, and I do not yet see what the point of that is.

from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import random
import datetime
import re
pages = set()
random.seed(datetime.datetime.now().timestamp())  #seed the RNG with the current time
#Get all internal links found on a page
def getInternalLinks(bsObj,includeUrl):
    includeUrl = urlparse(includeUrl).scheme+"://"+urlparse(includeUrl).netloc
    internalLinks = []
    #Find all links that begin with "/" or contain the current site's URL
    for link in bsObj.findAll("a",href=re.compile("^(/|.*"+includeUrl+")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if(link.attrs['href'].startswith("/")):
                    internalLinks.append(includeUrl+link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks

def getExternalLinks(bsObj,excludeUrl):
    externalLinks=[]
    #Find all links that start with "http" or "www" and do not contain the current domain
    for link in bsObj.findAll("a",href=re.compile("^(http|www)((?!"+excludeUrl+").)*$")):
        if link.attrs['href'] is not None:
            #Only record external links that have not been collected yet
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

def getRandomExternalLink(startingPage):
    html=urlopen(startingPage)
    bsObj= BeautifulSoup(html,"html.parser")
    externalLinks = getExternalLinks(bsObj,urlparse(startingPage).netloc)
    if len(externalLinks)==0:
        print("No external links found, retrying from a random internal link")
        domain = urlparse(startingPage).scheme+"://"+urlparse(startingPage).netloc
        internalLinks=getInternalLinks(bsObj,domain)
        return getRandomExternalLink(internalLinks[random.randint(0,len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0,len(externalLinks)-1)]

def followExternalOnly(startingPage):
    externalLink = getRandomExternalLink(startingPage)
    #externalLink = "https://en.wikipedia.org/wiki/Intelligence_agency"
    print("Random external link is: "+externalLink)
    followExternalOnly(externalLink)

# def main():
#     followExternalOnly("http://en.wikipedia.org")
#     print('End')
#
# if __name__ == '__main__':
#     main()
followExternalOnly("https://en.wikipedia.org/wiki/Main_Page")

(Author: 语落心生, source: https://www.jianshu.com/p/ec0cbe424353)
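
To make the control flow easier to follow, here is a minimal iterative sketch of the same random walk (my own rewrite, not part of the quoted program): it reuses getRandomExternalLink defined above and caps the number of hops so the crawl cannot recurse without limit.

def followExternalOnlyIterative(startingPage, maxHops=10):
    #Hop from one random external link to the next, at most maxHops times
    page = startingPage
    for _ in range(maxHops):
        externalLink = getRandomExternalLink(page)  #defined in the program above
        print("Random external link is: " + externalLink)
        page = externalLink

followExternalOnlyIterative("https://en.wikipedia.org/wiki/Main_Page")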

(Image: 正则表达式.png, regular expressions)

Ctrl+Shift+I opens the Chrome developer tools.
Now moving toward stock and fund analysis:

from urllib.request import urlopen as uu
import re

url=["http://fund.eastmoney.com/000051.html",
     "http://fund.eastmoney.com/213008.html",
     "http://fund.eastmoney.com/000173.html",
     "http://fund.eastmoney.com/000477.html"]

# Note: the HTML-tag parts of find_re and time_re appear to have been stripped when
# this post was published; only the capture groups survive, so these two patterns
# must be rebuilt from the real page source before the script works as intended.
find_re = re.compile(r'(.+?)', re.DOTALL)
html_re = re.compile(r'http://fund.eastmoney.com/(.+?).html', re.DOTALL)
time_re = re.compile(r'(.+?)', re.DOTALL)

for ul in url:
    html = uu(ul).read()
    html = html.decode('utf-8')  #Python 3: decode bytes to str
    print("Fund code: " + str(html_re.findall(ul)))
    print("Unit NAV: " + str(find_re.findall(html)))
    print("Last updated: " + str(time_re.findall(html)))
    print('')
(Image: 运行结果.png, output of the run)

This is a program that scrapes basic fund information using regular expressions only, with no dynamic content and no BeautifulSoup. Next I need to decide what to analyze; it will probably involve some machine learning, and the end goal is a small program that produces statistically meaningful results.
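
Since the tag portions of the patterns above did not survive, here is a tiny self-contained demo (toy markup of my own, not the real eastmoney page structure) of how a non-greedy capture group combined with re.DOTALL pulls a value out of HTML:

import re

#Toy markup for illustration only; the real fund page's HTML is different
sample = '<dd class="dataNums"><span>\n1.2345\n</span></dd>'
value_re = re.compile(r'<span>(.+?)</span>', re.DOTALL)  #non-greedy; DOTALL lets "." match newlines too
print([v.strip() for v in value_re.findall(sample)])     #prints ['1.2345']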

Back to scrolling Zhihu for inspiration!

from urllib.request import urlopen
from bs4 import BeautifulSoup

quote_page = 'http://www.bloomberg.com/quote/SPX:IND'
page = urlopen(quote_page)
soup = BeautifulSoup(page, "html.parser")
name_box = soup.find("h1", attrs={"class": "name"})
name = name_box.text.strip() # strip() removes leading and trailing whitespace
print (name)

price_box = soup.find("div", attrs={"class":"price"})
price = price_box.text
print (price)

This is a short BeautifulSoup snippet that scrapes static information from a Bloomberg quote page; the important thing now is to decide what to actually build!
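
As a small next step, here is a sketch of how the same snippet could cover several quote pages in one run (the second ticker URL is only an assumed example, and the class names may have changed on Bloomberg's side):

from urllib.request import urlopen
from bs4 import BeautifulSoup

#The second URL is an assumed example; update the list with whatever quotes are needed
quote_pages = [
    'http://www.bloomberg.com/quote/SPX:IND',
    'http://www.bloomberg.com/quote/CCMP:IND',
]

for quote_page in quote_pages:
    soup = BeautifulSoup(urlopen(quote_page), "html.parser")
    name_box = soup.find("h1", attrs={"class": "name"})
    price_box = soup.find("div", attrs={"class": "price"})
    if name_box is None or price_box is None:
        print("Could not parse " + quote_page)  #the expected tags are missing
        continue
    print(name_box.text.strip() + ": " + price_box.text.strip())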
