python转载[Rsync Algorithm]

 

转自:http://code.activestate.com/recipes/577518-rsync-algorithm/?in=lang-python

 

代码:Tested in Python 2.5, 2.6, and 3.1. In 2.7, io.BufferedReader should yield the best throughput. On all other versions use __builtin__.open.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This is a pure Python implementation of the [rsync algorithm](TM96).

[TM96] Andrew Tridgell and Paul Mackerras. The rsync algorithm.
Technical Report TR-CS-96-05, Canberra 0200 ACT, Australia, 1996.
http://samba.anu.edu.au/rsync/.

### Example Use Case: ###

    # On the system containing the file that needs to be patched
    >>> unpatched = open("unpatched.file", "rb")
    >>> hashes = blockchecksums(unpatched)

    # On the remote system after having received `hashes`
    >>> patchedfile = open("patched.file", "rb")
    >>> delta = rsyncdelta(patchedfile, hashes)

    # System with the unpatched file after receiving `delta`
    >>> unpatched.seek(0)
    >>> save_to = open("locally-patched.file", "wb")
    >>> patchstream(unpatched, save_to, delta)
"""

import  collections
import  hashlib

# Decide whether the interpreter needs a stand-in for the Python 3 ``bytes``
# type.  ``__builtins__`` must not be inspected for this: it is a module in
# ``__main__`` but a plain dict inside imported modules, so a ``hasattr``
# probe wrongly reports "missing" on Python 3 as soon as this file is imported.
try:
    _NEEDS_BYTES_SHIM = str is bytes  # True on Python 2.6/2.7
except NameError:
    # Python < 2.6 has no ``bytes`` name at all.
    _NEEDS_BYTES_SHIM = True

if _NEEDS_BYTES_SHIM:
    # Python 2.x compatibility
    def bytes(var, *args):
        """
        Emulate the parts of Python 3 ``bytes`` used by this module.

        An iterable of integers is joined into a byte string; a byte string
        is expanded into a list of integer byte values.  Extra positional
        arguments are accepted for call compatibility and ignored.
        """
        try:
            return ''.join(map(chr, var))
        except TypeError:
            # ``chr`` rejected the items, so ``var`` is already a byte
            # string: hand back its integer byte values instead.
            return map(ord, var)

# Public API of the module.  Each name is a separate list item; without the
# separating commas adjacent string literals would be concatenated into a
# single bogus name.
__all__ = [
    "rollingchecksum",
    "weakchecksum",
    "patchstream",
    "rsyncdelta",
    "blockchecksums",
]


def _find_matching_block(window, candidates, previous):
    """
    Return the index of a remote block whose strong hash equals the MD5 of
    ``window``, or None when no candidate matches.

    ``candidates`` is a list of ``(block_index, strong_hash)`` pairs in
    ascending index order.  The first block located after ``previous`` is
    preferred so unchanged files map to consecutive blocks; otherwise the
    earliest matching block is used, which covers moved or repeated data.
    """
    stronghash = hashlib.md5(bytes(window)).hexdigest()
    earlier = None
    for index, strong in candidates:
        if strong != stronghash:
            continue
        if index > previous:
            return index
        if earlier is None:
            earlier = index
    return earlier


def rsyncdelta(datastream, remotesignatures, blocksize=4096):
    """
    Generates a binary patch when supplied with the weak and strong
    hashes from an unpatched target and a readable stream for the
    up-to-date data. The blocksize must be the same as the value
    used to generate remotesignatures.

    datastream must provide ``read``, ``tell`` and ``closed``.
    remotesignatures is a pair ``(weak_checksums, strong_hashes)`` with one
    entry per remote block, in block order.

    Returns a list whose first element is the blocksize, followed by
    integers (index of a remote block to copy) and byte strings (literal
    data that has to be written as-is).
    """
    remote_weak, remote_strong = remotesignatures

    # Index the remote blocks by weak checksum once, so that the per-byte
    # lookup below is a dictionary probe instead of a scan over every block.
    signatures = {}
    for index, (weak, strong) in enumerate(zip(remote_weak, remote_strong)):
        signatures.setdefault(weak, []).append((index, strong))

    match = True
    matchblock = -1
    tailsize = 0
    newbyte = 0
    deltaqueue = collections.deque()

    while True:
        if match and datastream is not None:
            # Whenever there is a match or the loop is running for the first
            # time, populate the window using weakchecksum instead of rolling
            # through every single byte which takes at least twice as long.
            window = collections.deque(bytes(datastream.read(blocksize)))
            checksum, a, b = weakchecksum(window)

        # The strong hash is only computed when the cheap weak checksum has
        # at least one candidate.  matchblock is left untouched unless the
        # strong hash confirms the match, so a weak-only collision can no
        # longer hide later genuine matches.
        found = None
        candidates = signatures.get(checksum)
        if candidates:
            found = _find_matching_block(window, candidates, matchblock)

        if found is not None:
            matchblock = found
            match = True
            deltaqueue.append(found)

            # Once the input is exhausted the matched window was the last of
            # the data, so there is nothing left to read or roll.
            if datastream is None or datastream.closed:
                break
            continue

        # The window did not match any remote block.
        match = False
        if datastream is not None:
            # Get the next byte and affix to the window
            nextbyte = datastream.read(1)
            if nextbyte:
                newbyte = ord(nextbyte)
                window.append(newbyte)
            else:
                # No more data from the file; the window will slowly shrink.
                # newbyte needs to be zero from here on to keep the checksum
                # correct.
                newbyte = 0
                tailsize = datastream.tell() % blocksize
                datastream = None

        if datastream is None and len(window) <= tailsize:
            # The likelihood that any blocks will match after this is
            # nearly nil so call it quits.
            deltaqueue.append(window)
            break

        # Yank off the extra byte and calculate the new window checksum
        oldbyte = window.popleft()
        checksum, a, b = rollingchecksum(oldbyte, newbyte, a, b, blocksize)

        # Add the old byte the file delta. This is data that was not found
        # inside of a matching block so it needs to be sent to the target.
        try:
            deltaqueue[-1].append(oldbyte)
        except (AttributeError, IndexError):
            # The queue is empty or ends with a block index: start a new
            # run of literal bytes.
            deltaqueue.append([oldbyte])

    # Return a delta that starts with the blocksize and converts all iterables
    # to bytes.
    deltastructure = [blocksize]
    for element in deltaqueue:
        if isinstance(element, int):
            deltastructure.append(element)
        elif element:
            # Empty literal runs carry no data and are dropped.
            deltastructure.append(bytes(element))

    return deltastructure


def blockchecksums(instream, blocksize=4096):
    """
    Returns a list of weak and strong hashes for each block of the
    defined size for the given data stream.

    The stream is read from its current position until ``read`` returns
    no data.  The result is a pair ``(weakhashes, stronghashes)``: the
    weak entries are the combined checksum from weakchecksum, the strong
    entries are MD5 hex digests, both in block order.
    """
    weakhashes = []
    stronghashes = []
    read = instream.read(blocksize)

    while read:
        # bytes() is a no-op copy on Python 3 and converts the chunk to
        # integer byte values under the Python 2 compatibility shim.
        weakhashes.append(weakchecksum(bytes(read))[0])
        stronghashes.append(hashlib.md5(read).hexdigest())
        read = instream.read(blocksize)

    return weakhashes, stronghashes


def patchstream(instream, outstream, delta):
    """
    Patches instream using the supplied delta and writes the resultant
    data to outstream.

    delta is a list whose first element is the blocksize. Every following
    element is either an integer, meaning "copy that block from instream",
    or literal data that is written to outstream unchanged.
    """
    blocksize = delta[0]

    for element in delta[1:]:
        if isinstance(element, int) and blocksize:
            # A block reference: fetch the block from the unpatched data.
            instream.seek(element * blocksize)
            element = instream.read(blocksize)
        outstream.write(element)


def rollingchecksum(removed, new, a, b, blocksize=4096):
    """
    Generates a new weak checksum when supplied with the internal state
    of the checksum calculation for the previous window, the removed
    byte, and the added byte.

    ``a`` and ``b`` are the second and third values returned by the
    previous weakchecksum or rollingchecksum call.  Returns the same
    triple shape: ``(checksum, a, b)``.
    """
    # Swap the departing byte for the arriving one in the plain sum.
    a -= removed - new
    # The departing byte carried weight ``blocksize``; every remaining byte
    # (and the new one) gains or has weight one, which is exactly the
    # updated ``a``.
    b -= removed * blocksize - a
    return (b << 16) | a, a, b


def weakchecksum(data):
    """
    Generates a weak checksum from an iterable set of bytes.

    ``data`` may be any iterable of integer byte values (list, bytes on
    Python 3, deque, generator).  Returns ``(checksum, a, b)`` where ``a``
    is the plain sum of the bytes, ``b`` is the sum of the bytes weighted
    by their distance from the end of the data, and ``checksum`` combines
    the two as ``(b << 16) | a``.
    """
    a = b = 0
    for byte in data:
        a += byte
        # Adding the running sum each step weights the i-th byte by
        # (len(data) - i) without needing len() or indexing, so the data
        # is walked once even for containers with slow random access.
        b += a
    return (b << 16) | a, a, b

 

 

测试:

# On the system containing the file that needs to be patched 
>>> unpatched = open("unpatched.file", "rb") 
>>> hashes = blockchecksums(unpatched) 
 
# On the remote system after having received `hashes` 
>>> patchedfile = open("patched.file", "rb") 
>>> delta = rsyncdelta(patchedfile, hashes) 
 
# System with the unpatched file after receiving `delta` 
>>> unpatched.seek(0) 
>>> save_to = open("locally-patched.file", "wb") 
>>> patchstream(unpatched, save_to, delta) 

 

rsync算法:http://www.cnblogs.com/itech/archive/2010/06/13/1757952.html

 

完!


 

你可能感兴趣的:(python转载[Rsync Algorithm])