Example 6-1. Exploring common OS module data methods In [1]: import os In [2]: os.getcwd() Out[2]: '/private/tmp' In [3]: os.mkdir("/tmp/os_mod_explore") In [4]: os.listdir("/tmp/os_mod_explore") Out[4]: [] In [5]: os.mkdir("/tmp/os_mod_explore/test_dir1") In [6]: os.listdir("/tmp/os_mod_explore") Out[6]: ['test_dir1'] In [7]: os.stat("/tmp/os_mod_explore") Out[7]: (16877, 6029306L, 234881026L, 3, 501, 0, 102L, 1207014425, 1207014398, 1207014398) In [8]: os.rename("/tmp/os_mod_explore/test_dir1", "/tmp/os_mod_explore/test_dir1_renamed") In [9]: os.listdir("/tmp/os_mod_explore") Out[9]: ['test_dir1_renamed'] In [10]: os.rmdir("/tmp/os_mod_explore/test_dir1_renamed") In [11]: os.rmdir("/tmp/os_mod_explore/") Example 6-2. Using the shutil module to copy a data tree In [1]: import os In [2]: os.chdir("/tmp") In [3]: os.makedirs("test/test_subdir1/test_subdir2") In [4]: ls -lR total 0 drwxr-xr-x 3 ngift wheel 102 Mar 31 22:27 test/ ./test: total 0 drwxr-xr-x 3 ngift wheel 102 Mar 31 22:27 test_subdir1/ ./test/test_subdir1: total 0 drwxr-xr-x 2 ngift wheel 68 Mar 31 22:27 test_subdir2/ ./test/test_subdir1/test_subdir2: In [5]: import shutil In [6]: shutil.copytree("test", "test-copy") In [19]: ls -lR total 0 drwxr-xr-x 3 ngift wheel 102 Mar 31 22:27 test/ drwxr-xr-x 3 ngift wheel 102 Mar 31 22:27 test-copy/ ./test: total 0 drwxr-xr-x 3 ngift wheel 102 Mar 31 22:27 test_subdir1/ ./test/test_subdir1: total 0 drwxr-xr-x 2 ngift wheel 68 Mar 31 22:27 test_subdir2/ ./test/test_subdir1/test_subdir2: ./test-copy: total 0 drwxr-xr-x 3 ngift wheel 102 Mar 31 22:27 test_subdir1/ ./test-copy/test_subdir1: total 0 drwxr-xr-x 2 ngift wheel 68 Mar 31 22:27 test_subdir2/ ./test-copy/test_subdir1/test_subdir2: Example 6-3. 
# Moving a data tree with shutil
#
#   In [20]: shutil.move("test-copy", "test-copy-moved")
#
#   In [21]: ls -lR
#   total 0
#   drwxr-xr-x 3 ngift wheel 102 Mar 31 22:27 test/
#   drwxr-xr-x 3 ngift wheel 102 Mar 31 22:27 test-copy-moved/
#
#   ./test:
#   total 0
#   drwxr-xr-x 3 ngift wheel 102 Mar 31 22:27 test_subdir1/
#
#   ./test/test_subdir1:
#   total 0
#   drwxr-xr-x 2 ngift wheel 68 Mar 31 22:27 test_subdir2/
#
#   ./test/test_subdir1/test_subdir2:
#
#   ./test-copy-moved:
#   total 0
#   drwxr-xr-x 3 ngift wheel 102 Mar 31 22:27 test_subdir1/
#
#   ./test-copy-moved/test_subdir1:
#   total 0
#   drwxr-xr-x 2 ngift wheel 68 Mar 31 22:27 test_subdir2/
#
#   ./test-copy-moved/test_subdir1/test_subdir2:
#
# Example 6-4. Deleting a data tree with shutil
#
#   In [22]: shutil.rmtree("test-copy-moved")
#   In [23]: shutil.rmtree("test-copy")
#   In [24]: ll
#
# NOTE(review): "test-copy" was already moved away at In [20] and is absent
# from the listing at In [21], so In [23] would be expected to fail; the
# transcript is kept as published.
#
# Example 6-5. Verbose directory walking script

import os

path = "/tmp"


def enumeratepaths(path=path):
    """Return the full path of every file found beneath *path*.

    The tree is walked recursively with os.walk; each entry is the
    containing directory joined with the file name.
    """
    path_collection = []
    for dirpath, _dirnames, filenames in os.walk(path):
        path_collection.extend(
            os.path.join(dirpath, filename) for filename in filenames
        )
    return path_collection


def enumeratefiles(path=path):
    """Return the bare name (no directory part) of every file beneath *path*."""
    file_collection = []
    for _dirpath, _dirnames, filenames in os.walk(path):
        file_collection.extend(filenames)
    return file_collection


def enumeratedir(path=path):
    """Return the bare name of every directory found beneath *path*."""
    dir_collection = []
    for _dirpath, dirnames, _filenames in os.walk(path):
        dir_collection.extend(dirnames)
    return dir_collection


if __name__ == "__main__":
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    # The loop variables deliberately avoid the names `path`, `file` and
    # `dir` so neither the module-level default nor the builtins are rebound.
    print("\nRecursive listing of all paths in a dir:")
    for full_path in enumeratepaths():
        print(full_path)
    print("\nRecursive listing of all files in dir:")
    for file_name in enumeratefiles():
        print(file_name)
    print("\nRecursive listing of all dirs in dir:")
    for dir_name in enumeratedir():
        print(dir_name)

# Example 6-6.
# Creating reusable directory walking module

import os


class diskwalk(object):
    """API for getting directory walking collections.

    Each method walks the tree rooted at ``self.path`` with os.walk and
    returns a new list.
    """

    def __init__(self, path):
        self.path = path

    def enumeratePaths(self):
        """Return the full path of every file beneath ``self.path``."""
        path_collection = []
        for dirpath, _dirnames, filenames in os.walk(self.path):
            path_collection.extend(
                os.path.join(dirpath, filename) for filename in filenames
            )
        return path_collection

    def enumerateFiles(self):
        """Return the bare name of every file beneath ``self.path``."""
        file_collection = []
        for _dirpath, _dirnames, filenames in os.walk(self.path):
            file_collection.extend(filenames)
        return file_collection

    def enumerateDir(self):
        """Return the bare name of every directory beneath ``self.path``."""
        dir_collection = []
        for _dirpath, dirnames, _filenames in os.walk(self.path):
            dir_collection.extend(dirnames)
        return dir_collection


# Example 6-7. Performing an MD5 checksum on files

import hashlib


def create_checksum(path):
    """Return the raw MD5 digest of the file at *path*.

    The file is read in 8192-byte chunks so large files are never held in
    memory at once. It is opened in binary mode so the digest covers the
    exact bytes on disk, and the handle is closed even if a read fails.
    """
    checksum = hashlib.md5()
    with open(path, "rb") as fp:
        while True:
            chunk = fp.read(8192)
            if not chunk:
                break
            checksum.update(chunk)
    return checksum.digest()


# Example 6-8. Performing an MD5 checksum on a directory tree to find duplicates
#
#   In [1]: from checksum import create_checksum
#   In [2]: from diskwalk_api import diskwalk
#   In [3]: d = diskwalk('/tmp/duplicates_directory')
#   In [4]: files = d.enumeratePaths()
#   In [5]: len(files)
#   Out[5]: 12
#   In [6]: dup = []
#   In [7]: record = {}
#   In [8]: for file in files:
#               compound_key = (getsize(file),create_checksum(file))
#               if compound_key in record:
#                   dup.append(file)
#               else:
#                   record[compound_key] = file
#      ....:
#      ....:
#   In [9]: print dup
#   ['/tmp/duplicates_directory/image2']
#
# NOTE(review): In [1] originally imported "createChecksum", which does not
# match the create_checksum name called at In [8]; it is corrected above.
# The session also uses getsize without showing "from os.path import getsize".
#
# Example 6-9.
# Finding duplicates

from checksum import create_checksum
from diskwalk_api import diskwalk
from os.path import getsize


def findDupes(path='/tmp'):
    """Return the paths under *path* whose content duplicates an earlier file.

    Files are compared by a (size, checksum) compound key. The first file
    seen with a given key is treated as the original; every later file with
    the same key is appended to the returned list.
    """
    dup = []
    seen = set()
    for filepath in diskwalk(path).enumeratePaths():
        compound_key = (getsize(filepath), create_checksum(filepath))
        if compound_key in seen:
            dup.append(filepath)
        else:
            #print "Creating compound key record:", compound_key
            seen.add(compound_key)
    return dup


if __name__ == "__main__":
    dupes = findDupes()
    for dup in dupes:
        # The source had typographic quotes here, which is a syntax error.
        print("Duplicate: %s" % dup)

# Example 6-10. Delete module

#!/usr/bin/env python
import os

try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3


class Delete(object):
    """Delete Methods For File Objects"""

    def __init__(self, file):
        self.file = file

    def interactive(self):
        """Ask for confirmation and delete the file only on an explicit "Y".

        Any other answer, including an empty one, skips the file; this
        matches the "[N]/Y" prompt where N is the default.
        """
        answer = _read_line("Do you really want to delete %s [N]/Y" % self.file)
        # Compare against "Y" explicitly: testing the truthiness of
        # answer.upper() would delete on any non-empty reply, even "N".
        if answer.strip().upper() == "Y":
            print("DELETING: %s" % self.file)
            os.remove(self.file)
        else:
            print("Skipping: %s" % self.file)
        return

    def dryrun(self):
        """Simulation mode: report the file without deleting it."""
        print("Dry Run: %s [NOT DELETED]" % self.file)
        return

    def delete(self):
        """Delete the file, printing the error if removal fails.

        Returns the result of os.remove (None); None is also returned when
        removal raised and the error was printed.
        """
        print("DELETING: %s" % self.file)
        # Bind status up front so the return below cannot hit an unbound
        # local when os.remove raises.
        status = None
        try:
            status = os.remove(self.file)
        except Exception as err:
            # Best-effort by design: report the failure and carry on.
            print(err)
        return status


if __name__ == "__main__":
    from find_dupes import findDupes
    dupes = findDupes('/tmp')
    for dupe in dupes:
        delete = Delete(dupe)
        #delete.dryrun()
        #delete.delete()
        #delete.interactive()

# Example 6-11. Interactively using fnmatch and glob to search for file matches
#
#   In [1]: from diskwalk_api import diskwalk
#   In [2]: files = diskwalk("/tmp").enumeratePaths()
#   In [3]: from fnmatch import fnmatch
#   In [4]: for file in files:
#      ...:     if fnmatch(file,"*.txt"):
#      ...:         print file
#      ...:
#      ...:
#   /tmp/file.txt
#   In [5]: from glob import glob
#   In [6]: import os
#   In [7]: os.chdir("/tmp")
#   In [8]: glob("*")
#   Out[8]: ['file.txt', 'image.iso', 'music.mp3']
#
# NOTE(review): In [2] originally read files = diskwalk("/tmp"); the diskwalk
# class of Example 6-6 defines no iteration support, so the call to
# enumeratePaths() is added to match the full path printed after In [4].
#
# Example 6-12.
# Renaming a tree full of MP3 files to text files
#
#   In [1]: from diskwalk_api import diskwalk
#   In [2]: from shutil import move
#   In [3]: from fnmatch import fnmatch
#   In [4]: files = diskwalk("/tmp").enumeratePaths()
#   In [5]: for file in files:
#               if fnmatch(file, "*.mp3"):
#                   #here we can do anything we want, delete, move, rename...hmmm rename
#                   move(file, "%s.txt" % file)
#   In [6]: ls -l /tmp/
#   total 0
#   -rw-r--r-- 1 ngift wheel 0 Apr 1 21:50 file.txt
#   -rw-r--r-- 1 ngift wheel 0 Apr 1 21:50 image.iso
#   -rw-r--r-- 1 ngift wheel 0 Apr 1 21:50 music.mp3.txt
#   -rw-r--r-- 1 ngift wheel 0 Apr 1 22:45 music1.mp3.txt
#   -rw-r--r-- 1 ngift wheel 0 Apr 1 22:45 music2.mp3.txt
#   -rw-r--r-- 1 ngift wheel 0 Apr 1 22:45 music3.mp3.txt
#
# NOTE(review): In [4] originally read files = diskwalk("/tmp"); the diskwalk
# class of Example 6-6 defines no iteration support, so the call to
# enumeratePaths() is added to make the loop at In [5] valid.
#
# Example 6-13. Simple wrap of rsync

#!/usr/bin/env python
#wraps up rsync to synchronize two directories
from subprocess import call
import sys

source = "/tmp/sync_dir_A/" #Note the trailing slash
target = "/tmp/sync_dir_B"
rsync = "rsync"
arguments = "-a"
cmd = "%s %s %s %s" % (rsync, arguments, source, target)


def sync():
    """Run the rsync command once; exit with status 1 if it fails."""
    ret = call(cmd, shell=True)
    if ret != 0:
        print("rsync failed")
        sys.exit(1)


sync()

# Example 6-14. An rsync command that doesn’t quit until the job is finished

#!/usr/bin/env python
#wraps up rsync to synchronize two directories
from subprocess import call
import sys
import time

"""this motivated rsync tries to synchronize forever"""

source = "/tmp/sync_dir_A/" #Note the trailing slash
target = "/tmp/sync_dir_B"
rsync = "rsync"
arguments = "-av"
cmd = "%s %s %s %s" % (rsync, arguments, source, target)


def sync():
    """Re-run rsync every 30 seconds until it succeeds, then send mail and exit 0."""
    while True:
        ret = call(cmd, shell=True)
        if ret != 0:
            print("resubmitting rsync")
            time.sleep(30)
        else:
            print("rsync was succesful")
            # Only `call` is imported from subprocess, so the bare name must
            # be used; `subprocess.call` raised NameError on this success path.
            # NOTE(review): the recipient below looks like an e-mail address
            # mangled by obfuscation in the source text; replace it with the
            # real address before use.
            call("mail -s 'jobs done' [email protected]", shell=True)
            sys.exit(0)


sync()

# Example 6-15.
# Creating metadata about a filesystem with SQLAlchemy

#!/usr/bin/env python
from sqlalchemy import create_engine
from sqlalchemy import Table, Column, Integer, String, MetaData, ForeignKey
from sqlalchemy.orm import mapper, sessionmaker
import os

#path
# The source had a leading space (" /tmp"), which names a directory that
# normally does not exist, so the crawl in Part 6 would record nothing.
path = "/tmp"

#Part 1: create engine
engine = create_engine('sqlite:///:memory:', echo=False)

#Part 2: metadata
metadata = MetaData()
filesystem_table = Table('filesystem', metadata,
    Column('id', Integer, primary_key=True),
    Column('path', String(500)),
    Column('file', String(255)),
)
metadata.create_all(engine)


#Part 3: mapped class
class Filesystem(object):
    """One crawled file: its full path and its bare file name."""

    def __init__(self, path, file):
        self.path = path
        self.file = file

    def __repr__(self):
        return "[Filesystem('%s','%s')]" % (self.path, self.file)


#Part 4: mapper function
mapper(Filesystem,filesystem_table)

#Part 5: create session
# NOTE(review): transactional=True here and session.save() below appear to
# be spellings from an early SQLAlchemy release; confirm them against the
# installed version before running.
Session = sessionmaker(bind=engine, autoflush=True, transactional=True)
session = Session()

#Part 6: crawl file system and populate database with results
for dirpath, dirnames, filenames in os.walk(path):
    for file in filenames:
        fullpath = os.path.join(dirpath, file)
        record = Filesystem(fullpath, file)
        session.save(record)

#Part 7: commit to the database
session.commit()

#Part 8: query
for record in session.query(Filesystem):
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Database Record Number: %s, Path: %s , File: %s "
          % (record.id,record.path, record.file))

# Example 6-16. Create big text file
#
#   In [1]: f = open("largeFile.txt", "w")
#   In [2]: statement = "This is a big line that I intend to write over and over again."
#   In [3]: x = 0
#   In [4]: for x in xrange(20000):
#      ....:     x += 1
#      ....:     f.write("%s\n" % statement)
#      ....:
#      ....:
#   In [4]: ls -l
#   -rw-r--r-- 1 root root 1236992 Oct 25 23:13 largeFile.txt
#
# Example 6-17. TAR up contents of file
#
#   In [1]: import tarfile
#   In [2]: tar = tarfile.open("largefile.tar", "w")
#   In [3]: tar.add("largeFile.txt")
#   In [4]: tar.close()
#
# Example 6-18.
TAR up contents of a directory tree In [27]: import tarfile In [28]: tar = tarfile.open("temp.tar", "w") In [29]: import os In [30]: for root, dir, files in os.walk("/tmp"): ....: for file in filenames: ....: KeyboardInterrupt In [30]: for root, dir, files in os.walk("/tmp"): for file in files: ....: fullpath = os.path.join(root,file) ....: tar.add(fullpath) ....: ....: In [33]: tar.close() Example 6-19. Creating bzip2 TAR archive In [1]: tar = tarfile.open("largefilecompressed.tar.bzip2", "w|bz2") In [2]: tar.add("largeFile.txt") In [3]: ls -h foo1.txt fooDir1/ largeFile.txt largefilecompressed.tar.bzip2* foo2.txt fooDir2/ largefile.tar In [4]: tar.close() In [5]: ls -lh -rw-r--r-- 1 root root 61M Oct 25 23:15 largeFile.txt -rw-r--r-- 1 root root 61M Oct 26 00:39 largefile.tar -rwxr-xr-x 1 root root 10K Oct 26 01:02 largefilecompressed.tar.bzip2* Example 6-20. Creating a gzip TAR archive In [10]: tar = tarfile.open("largefile.tar.gzip", "w|gz") In [11]: tar.add("largeFile.txt") In [12]: tar.close() In [13]: ls -lh -rw-r--r-- 1 root root 61M Oct 26 01:20 largeFile.txt -rw-r--r-- 1 root root 61M Oct 26 00:39 largefile.tar -rwxr-xr-x 1 root root 160K Oct 26 01:24 largefile.tar.gzip*