02 爬虫爬取网页,探测网页变化,追踪github话题热度

简单爬网页

from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests

# Fetch one listing page and print the image, title and link of each entry.
base_url = 'https://knewone.com/'
url = 'https://knewone.com/?page=2'
# A timeout keeps the script from hanging forever on an unresponsive server.
wb_data = requests.get(url, timeout=10)
soup = BeautifulSoup(wb_data.text, 'lxml')

# Full selectors as copied from the browser inspector, kept for reference:
#   #wrapper > ul > li:nth-child(39) > article > header > a > img
#   #wrapper > ul > li:nth-child(39) > article > section > h4 > a
imgs = soup.select('article > header > a > img')
# Title and link are read from the same <a> elements, so query them once.
titles = soup.select('article > section > h4 > a')
links = titles

for img, title, link in zip(imgs, titles, links):
    data = {
        'img': img.get('src'),
        'title': title.get('title'),
        # urljoin resolves the href against the site root whether or not it
        # starts with '/'; plain concatenation would yield '//' for
        # root-relative hrefs.
        'link': urljoin(base_url, link.get('href')),
    }
    print(data)

如果是动态异步加载的网页,需要打开"审查元素",切换到 Network 面板并选中其中的 XHR,然后让页面再加载一次信息,就可以在请求列表里看到对应的 URL 尾缀了。

假设我们想要探测如下网页的变化,看看作者有没有更新。首先,网页地址:

https://github.com/lennylxx/ipv6-hosts
截图:

截图

对应的api为:https://api.github.com/repos/lennylxx/ipv6-hosts
打开以后会有如下的JSON代码,很像python里面的字典:

{
  "id": 21858929,
  "node_id": "MDEwOlJlcG9zaXRvcnkyMTg1ODkyOQ==",
  "name": "ipv6-hosts",
  "full_name": "lennylxx/ipv6-hosts",
  "owner": {
    "login": "lennylxx",
    "id": 5811576,
    "node_id": "MDQ6VXNlcjU4MTE1NzY=",
    "avatar_url": "https://avatars3.githubusercontent.com/u/5811576?v=4",
    "gravatar_id": "",
    "url": "https://api.github.com/users/lennylxx",
    "html_url": "https://github.com/lennylxx",
    "followers_url": "https://api.github.com/users/lennylxx/followers",
    "following_url": "https://api.github.com/users/lennylxx/following{/other_user}",
    "gists_url": "https://api.github.com/users/lennylxx/gists{/gist_id}",
    "starred_url": "https://api.github.com/users/lennylxx/starred{/owner}{/repo}",
    "subscriptions_url": "https://api.github.com/users/lennylxx/subscriptions",
    "organizations_url": "https://api.github.com/users/lennylxx/orgs",
    "repos_url": "https://api.github.com/users/lennylxx/repos",
    "events_url": "https://api.github.com/users/lennylxx/events{/privacy}",
    "received_events_url": "https://api.github.com/users/lennylxx/received_events",
    "type": "User",
    "site_admin": false
  },
  "private": false,
  "html_url": "https://github.com/lennylxx/ipv6-hosts",
  "description": null,
  "fork": false,
  "url": "https://api.github.com/repos/lennylxx/ipv6-hosts",
  "forks_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/forks",
  "keys_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/keys{/key_id}",
  "collaborators_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/collaborators{/collaborator}",
  "teams_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/teams",
  "hooks_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/hooks",
  "issue_events_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/issues/events{/number}",
  "events_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/events",
  "assignees_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/assignees{/user}",
  "branches_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/branches{/branch}",
  "tags_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/tags",
  "blobs_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/git/blobs{/sha}",
  "git_tags_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/git/tags{/sha}",
  "git_refs_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/git/refs{/sha}",
  "trees_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/git/trees{/sha}",
  "statuses_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/statuses/{sha}",
  "languages_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/languages",
  "stargazers_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/stargazers",
  "contributors_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/contributors",
  "subscribers_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/subscribers",
  "subscription_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/subscription",
  "commits_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/commits{/sha}",
  "git_commits_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/git/commits{/sha}",
  "comments_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/comments{/number}",
  "issue_comment_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/issues/comments{/number}",
  "contents_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/contents/{+path}",
  "compare_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/compare/{base}...{head}",
  "merges_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/merges",
  "archive_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/{archive_format}{/ref}",
  "downloads_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/downloads",
  "issues_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/issues{/number}",
  "pulls_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/pulls{/number}",
  "milestones_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/milestones{/number}",
  "notifications_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/notifications{?since,all,participating}",
  "labels_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/labels{/name}",
  "releases_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/releases{/id}",
  "deployments_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/deployments",
  "created_at": "2014-07-15T12:36:53Z",
  "updated_at": "2018-07-04T07:31:08Z",
  "pushed_at": "2018-06-22T01:57:04Z",
  "git_url": "git://github.com/lennylxx/ipv6-hosts.git",
  "ssh_url": "git@github.com:lennylxx/ipv6-hosts.git",
  "clone_url": "https://github.com/lennylxx/ipv6-hosts.git",
  "svn_url": "https://github.com/lennylxx/ipv6-hosts",
  "homepage": "",
  "size": 7345,
  "stargazers_count": 2858,
  "watchers_count": 2858,
  "language": "Python",
  "has_issues": true,
  "has_projects": true,
  "has_downloads": true,
  "has_wiki": true,
  "has_pages": false,
  "forks_count": 861,
  "mirror_url": null,
  "archived": false,
  "open_issues_count": 12,
  "license": {
    "key": "mit",
    "name": "MIT License",
    "spdx_id": "MIT",
    "url": "https://api.github.com/licenses/mit",
    "node_id": "MDc6TGljZW5zZTEz"
  },
  "forks": 861,
  "open_issues": 12,
  "watchers": 2858,
  "default_branch": "master",
  "network_count": 861,
  "subscribers_count": 313
}

更新时间在哪里?
在上述JSON文件里,标注了"updated_at": "2018-07-04T07:31:08Z",这就是更新时间。
如果想要看网页是否变化,就对更新时间进行检测即可。

import time
import webbrowser

import requests

# Repository being watched. The API URL returns a single JSON object that
# carries an 'updated_at' timestamp; web_page is opened in the browser when
# that timestamp moves forward.
api = 'https://api.github.com/repos/lennylxx/ipv6-hosts'
web_page = 'https://github.com/lennylxx/ipv6-hosts'
last_update = None
while True:
    # Re-fetch on every pass: a value read once before the loop never changes.
    response = requests.get(api, timeout=10)
    response.raise_for_status()
    all_info = response.json()
    cur_update = all_info['updated_at']
    print(cur_update)
    if not last_update:
        # First poll: remember the current timestamp as the baseline.
        last_update = cur_update

    # Timestamps have the fixed-width form '2018-07-04T07:31:08Z', so plain
    # string comparison orders them chronologically.
    if last_update < cur_update:
        webbrowser.open(web_page)
        # Advance the baseline so the page is opened once per update.
        last_update = cur_update
    time.sleep(600)

对比几个热门库的热度

这里可以直接使用 GitHub 现成的搜索 API:https://developer.github.com/v3/search/,我用的是其中的查询参数 q。
以django为例,https://api.github.com/search/repositories?q=django,这是django相关的项目,api有一个好处,那就是简单,json呈现。python中有.json()方法,可以使得json转化为python的字典、列表等等。


再比如topic内容是Django的,都有现成的api可以用:https://api.github.com/search/repositories?q=topic:django

那么使用的时候应该这样去做:

#https://api.github.com/search/repositories?q=topic:django
#https://api.github.com/search/repositories?q=django

#get_names -- check_repos

import requests

def get_names():
    """Prompt on stdout, read one line from stdin, and return its words.

    Returns:
        list of str: the whitespace-separated tokens of the entered line
        (an empty list when the line is blank).
    """
    print('Separate each name with Space')
    raw_line = input()
    tokens = raw_line.split()
    return tokens

def check_repos(names):
    """Print star, fork and topic counts from GitHub search for each name.

    For every entry in ``names`` two search queries are sent:
    ``q=<name>``, whose first result is taken as the repository, and
    ``q=topic:<name>``, whose ``total_count`` is printed as "Ecosys".

    Args:
        names: Iterable of search terms, one per project.

    Raises:
        requests.HTTPError: If GitHub answers with an error status code.
    """
    search_api = 'https://api.github.com/search/repositories'
    for name in names:
        # params= lets requests URL-encode the term, so characters such as
        # '&' or '#' in a name cannot corrupt the query string.
        repo_resp = requests.get(search_api, params={'q': name}, timeout=10)
        # Fail with the HTTP error itself rather than a KeyError further down.
        repo_resp.raise_for_status()
        # Response JSON -> dict -> dict['items'] (a list of repositories);
        # the first entry is used as the match for this name.
        items = repo_resp.json()['items']
        if not items:
            # Nothing matched: report it instead of failing on items[0].
            print(name)
            print('No repository found')
            print('-------------------')
            continue
        repo_info = items[0]
        stars = repo_info['stargazers_count']
        forks = repo_info['forks_count']

        ecosys_resp = requests.get(
            search_api, params={'q': 'topic:' + name}, timeout=10
        )
        ecosys_resp.raise_for_status()
        ecosys_info = ecosys_resp.json()['total_count']

        print(name)
        print('Stars:' + str(stars))
        print('Forks:' + str(forks))
        print('Ecosys:' + str(ecosys_info))
        print('-------------------')

if __name__ == '__main__':
    # Prompt for names and query GitHub only when run as a script, so that
    # importing this module triggers no stdin read and no network calls.
    names = get_names()
    check_repos(names)

输出结果:

>>>Separate each name with Space
flask django sanic bottle
flask
Stars:37174
Forks:11015
Ecosys:6734
-------------------
django
Stars:34965
Forks:14861
Ecosys:10212
-------------------
sanic
Stars:9640
Forks:895
Ecosys:158
-------------------
bottle
Stars:5528
Forks:1125
Ecosys:117
-------------------

你可能感兴趣的:(02 爬虫爬取网页,探测网页变化,追踪github话题热度)