爬取 GitHub 的 explore 应用和简要介绍

import requests
from bs4 import BeautifulSoup


def getNewMak(link, mark, applist, cnt):
    """Collect the app cards found below ``link`` into ``applist``.

    Every ``<div class="px-3 pt-1">`` under ``link`` is treated as one app
    card: the text of its ``<h3>`` is the app name and the text of its
    ``<p>`` is the short introduction.

    Args:
        link: Object exposing a BeautifulSoup-style ``find_all`` (a tag or a
            whole parsed document).
        mark: Number already given to the previous app of this category; the
            first card recorded here is numbered ``mark + 1``.
        applist: List extended in place with ``[cnt, mark, name, intro]`` rows.
        cnt: Category number supplied by the caller; stored unchanged as the
            first element of every row.

    Returns:
        The number given to the last recorded card, or ``mark`` unchanged if
        no card was recorded. ``mark`` is an int, so the increments made here
        are invisible to the caller unless the value is returned.
    """
    for app in link.find_all('div', class_='px-3 pt-1'):
        heading = app.h3
        if heading is None:
            # No heading means no app name; skip the card instead of letting
            # an AttributeError abort the whole scrape.
            continue
        blurb = app.p
        # A card may legitimately lack a description paragraph.
        intro = blurb.text.strip() if blurb is not None else ''
        mark += 1
        applist.append([cnt, mark, heading.text.strip(), intro])
    return mark


def getHTML(url, timeout=10):
    """Download ``url`` with an HTTP GET and return the body as text.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded response body.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of handing an error page to the
    # HTML parser, which would silently yield no results.
    response.raise_for_status()
    return response.text


def getInfo(html, titlelist, applist):
    """Parse the landing page and collect category titles and their apps.

    Each ``<div class="mb-6 mb-md-0">`` in ``html`` is treated as one
    category section. For every section the ``<h2>`` text is appended to
    ``titlelist``, the app cards shown directly in the section are appended
    to ``applist``, and then the page behind the section's first ``<a>`` is
    downloaded and its app cards are appended as well.

    Args:
        html: Markup of the landing page.
        titlelist: List extended in place with one title per category.
        applist: List extended in place with ``[cnt, mark, name, intro]``
            rows, where ``cnt`` is the 1-based category number matching the
            position of its title in ``titlelist``.

    Raises:
        Whatever ``getHTML`` raises when a category page cannot be fetched.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    base_url = 'https://github.com'
    soup = BeautifulSoup(html, 'html.parser')
    cnt = 0
    for link in soup.find_all('div', class_='mb-6 mb-md-0'):
        heading = link.h2
        if heading is None:
            # Skip before bumping cnt so category numbers stay aligned with
            # the positions in titlelist.
            continue
        cnt += 1
        titlelist.append(heading.text.strip())

        rows_before = len(applist)
        getNewMak(link, 0, applist, cnt)
        # Ints are passed by value, so the numbering done inside getNewMak is
        # not visible here; derive it from how many rows were added so the
        # category page continues the numbering instead of restarting at 1.
        mark = len(applist) - rows_before

        anchor = link.a
        href = anchor.get('href') if anchor is not None else None
        if not href:
            # No category page to follow; keep what the section itself listed.
            continue
        # urljoin handles absolute hrefs and hrefs without a leading slash,
        # both of which plain string concatenation would mangle.
        inside_html = getHTML(urljoin(base_url, str(href)))
        inside_soup = BeautifulSoup(inside_html, 'html.parser')
        getNewMak(inside_soup, mark, applist, cnt)


def Infile(titlelist, applist):
    """Print every app row, grouped under the title of its category.

    Args:
        titlelist: Category titles; the title of category number ``n``
            (1-based) is ``titlelist[n - 1]``.
        applist: Rows of ``[cnt, mark, name, intro]`` where ``cnt`` is the
            1-based category number. Rows of one category are expected to be
            adjacent.

    A title line is printed whenever the category number of the current row
    differs from that of the previous row. Looking the title up by the row's
    own category number keeps titles and rows aligned even when some
    category contributed no rows at all.
    """
    template = "{0:^5}\t{1:{3}<30}\t{2:{3}<40}"
    fill = chr(12288)  # U+3000 ideographic space, used as the padding character
    current = 0  # category numbers start at 1, so 0 means "none printed yet"
    for row in applist:
        category = row[0]
        if category != current:
            current = category
            print(titlelist[category - 1])
        print(template.format(row[1], row[2], row[3], fill))


def main():
    """Scrape the marketplace landing page and print the collected apps.

    Downloads ``https://github.com/marketplace``, lets ``getInfo`` fill the
    title and app lists from it, then prints them with ``Infile``.
    """
    titles, apps = [], []
    landing_page = getHTML('https://github.com/marketplace')
    getInfo(landing_page, titles, apps)
    Infile(titles, apps)


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

你可能感兴趣的:(爬取github的explore应用和简要介绍)