Python 网络爬虫源码(抓取源视频)

样本

网络来源

作者: hehao

原文抓取linux520网站的渗透测试视频,无意侵犯linux520网站权益。

 

源码分享学习

#!/usr/bin/env python

# -*- coding: UTF-8 -*-

#version 0.1

#author:hehao

#python version:2.7.2

# Requires the third-party psutil library

#

from os.path import basename

from urlparse import urlsplit

import os

import urllib2

import sys

# psutil is a third-party dependency: stop with an install hint when it is
# missing instead of failing later with a NameError.
try:
    import psutil
except ImportError:
    # Parenthesised form prints the same text under the Python 2 print
    # statement and the Python 3 print function.
    print("please install psutil ex:pypm install psutil")
    # Non-zero status so a calling shell can tell the script did not run.
    sys.exit(1)

import re

def url2name(url):
    """Return the last path component of *url*, to be used as a file name.

    Only the path part of the URL is considered, so a query string or
    fragment never ends up in the name.  A URL whose path ends in "/"
    yields an empty string.
    """
    # Index 2 of the urlsplit() result is the path component.
    return basename(urlsplit(url)[2])

def _filename_from_disposition(disposition):
    """Return the file name carried by a Content-Disposition value, or None."""
    if not disposition or 'filename=' not in disposition:
        return None
    value = disposition.split('filename=', 1)[1].strip()
    if value[:1] in ('"', "'"):
        # Quoted value: take everything up to the matching closing quote.
        closing = value.find(value[0], 1)
        if closing != -1:
            value = value[1:closing]
        else:
            value = value[1:]
    else:
        # Unquoted value: further header parameters may follow after ';'.
        value = value.split(';', 1)[0].strip()
    # The header is controlled by the server: keep only the final path
    # component so it cannot choose the directory the file is written to.
    value = basename(value.replace('\\', '/'))
    return value or None


def download(url, localFileName=None):
    """Download *url* and save the response body to a local file.

    The file name is chosen in this order:

    1. ``localFileName`` when the caller supplies one;
    2. the ``filename`` given in the response's Content-Disposition header;
    3. the last path component of the final URL when the request was
       redirected;
    4. the last path component of *url*.

    The file is written in binary mode.  Errors raised while opening the
    URL or writing the file propagate to the caller.
    """
    localName = url2name(url)
    r = urllib2.urlopen(urllib2.Request(url))
    try:
        headerName = _filename_from_disposition(
            r.info().get('Content-Disposition'))
        if headerName:
            localName = headerName
        elif r.url != url:
            # Redirected: the real file name comes from the final URL.
            localName = url2name(r.url)
        if localFileName:
            # An explicit name from the caller always wins.
            localName = localFileName
        with open(localName, 'wb') as f:
            # Copy in chunks so a large video is never held in memory whole.
            while True:
                chunk = r.read(64 * 1024)
                if not chunk:
                    break
                f.write(chunk)
    finally:
        # Release the connection even when writing the file fails.
        r.close()

def getpid(process_name):
    """Return the PID of the first process whose description mentions a name.

    Every process reported by psutil is converted with ``str()`` and
    checked for *process_name* as a substring.

    Returns 0 when no process matches.
    """
    # NOTE(review): newer psutil releases appear to provide only
    # process_iter(); get_process_list() is kept as a fallback for the old
    # release this script was written against -- confirm against the
    # installed version.
    list_processes = (getattr(psutil, 'process_iter', None)
                      or psutil.get_process_list)
    for proc in list_processes():
        if process_name in str(proc):
            return proc.pid
    # Report "not found" only after every process has been examined.
    return 0

def killpid(pid):
    """Kill the process with the given PID on a best-effort basis.

    Returns 0 when nothing was killed: *pid* is falsy (for example the 0
    that ``getpid`` returns for "not found"), the process cannot be looked
    up, or the kill itself fails.  Returns None after a successful kill.
    """
    if not pid:
        return 0
    try:
        # Looking the process up can fail as well (it may already have
        # exited), so it belongs inside the try together with the kill.
        psutil.Process(pid).kill()
    except Exception:
        return 0

def analy_swf(swf_path):
    """Run swfdump on a flash file and return the real file name it reveals.

    The output of ``swfdump.exe -a <swf_path>`` is scanned line by line and
    the content of the first ``<uri>...</uri>`` element found is returned.

    Returns None when no line of the output contains a ``<uri>`` element.
    """
    pipe = os.popen(r"swfdump.exe -a " + swf_path)
    try:
        for line in pipe:
            uris = re.findall(r"""<uri>([\S\s]*?)</uri>""", line)
            if uris:
                return uris[0]
        return None
    finally:
        # Returning on the first match can leave swfdump still running.
        # Kill it first so that closing the pipe does not wait for the
        # rest of its output.
        pid = getpid("swfdump.exe")
        if pid:
            killpid(pid)
        pipe.close()

def download_realvideo(swf_url,url,id):
    """Download the real video file referenced by a flash player file.

    The player at *swf_url* is saved as ``tmp.swf`` in the current
    directory and analysed with ``analy_swf`` to obtain the real file name.
    That name is appended to *url* to build the video address, and the
    video is saved as ``<id>_<real name>``.

    ``tmp.swf`` is removed afterwards, whether or not the download worked.

    Raises ValueError when no real file name can be extracted from the
    player file.  Download errors propagate unchanged.
    """
    # NOTE: the parameter name ``id`` shadows the builtin; it is kept
    # because it is part of the function's signature.
    try:
        download(swf_url, 'tmp.swf')
        r_name = analy_swf("tmp.swf")
        if not r_name:
            # Fail with a clear message instead of a TypeError from
            # concatenating None below.
            raise ValueError("no <uri> entry found in " + swf_url)
        download(url + r_name, str(id) + "_" + r_name)
    finally:
        # Never leave the temporary player file behind.
        if os.path.exists('tmp.swf'):
            os.remove('tmp.swf')

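# Example: download a single video by hand (the real URLs are withheld).
# url = "<video SWF URL, not public>"
# u = "<page URL, not public>"
# download_realvideo(url, u, 138)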

# Base address of the video pages; each page is addressed as <url><n>/.
url="该url不公开(地址)"

# Compiled once here instead of on every loop iteration.
_SRC_RE = re.compile(r"""<param name\=\"src\"\svalue\=\"(.*?)\"\/>""")
_URI_RE = re.compile(r"""<uri>([\S\s]*?)</uri>""")

# Visit pages 1..199.  Every page is best effort: a failure is reported on
# stderr and the crawl moves on.  Only Exception is caught, so Ctrl-C
# (KeyboardInterrupt) still stops the script.
for x in range(1,200):
    u = url + str(x) + "/"
    print(u)
    try:
        a = urllib2.urlopen(urllib2.Request(u)).read()
    except Exception as exc:
        sys.stderr.write("skip %s: cannot fetch page (%r)\n" % (u, exc))
        continue

    # Extract the "src" value of the embedded player from the page.
    found = _SRC_RE.findall(a)
    if not found:
        continue
    s = found[0]

    try:
        if '#' in s:
            # NOTE(review): presumably a '#' marks a player that is
            # described by a "<prefix>_config.xml" file -- confirm against
            # the site.  The real file name is read from its <uri> element.
            config = urllib2.urlopen(
                urllib2.Request(u + s.split('_')[0] + '_config.xml')).read()
            real_name = _URI_RE.findall(config)
            download(u + real_name[0], str(x) + '_' + real_name[0])
        elif 'swf' in s:
            # A flash player: the real video name is hidden inside the swf.
            download_realvideo(u + s, u, x)
        else:
            # The src value already points at the video file itself.
            download(u + s, str(x) + '_' + s)
    except Exception as exc:
        sys.stderr.write("skip %s: download failed (%r)\n" % (u, exc))

你可能感兴趣的:(网络,视频,python,version)