# -*- coding: utf-8 -*-
import re

import requests
from bs4 import BeautifulSoup as bs


def getcontent(url):
    """Fetch a page and return its decoded text (the site serves GBK-encoded pages)."""
    r = requests.get(url)
    r.encoding = "GBK"
    return r.text


def gethtml(content, www=''):
    """Collect detail-page URLs from the index page (anchors with class 'ulink')."""
    wwwurl = []
    soup = bs(content, 'html.parser')
    for link in soup.find_all(name='a', attrs={'class': 'ulink'}):
        newurl = www + link.get('href')
        wwwurl.append(newurl)
    return wwwurl


def getftp(content):
    """Return the last ftp:// download link found on a detail page ('' if none)."""
    soup = bs(content, 'html.parser')
    ftplink = ''
    for link in soup.find_all(href=re.compile("ftp://")):
        ftplink = link.get('href')
    return ftplink


# get html information
url = 'http://www.dytt8.net/html/gndy/dyzz/index.html'
www = '/'.join(url.split('/')[0:3])  # scheme + host, e.g. http://www.dytt8.net
content = getcontent(url)
newurl = gethtml(content, www)

# get ftp download information
urltotal = []
for i in newurl:
    ct = getcontent(i)
    urltotal.append(getftp(ct))
print('\n'.join(urltotal))