Colapse backup

Full backup with large ZIP files on removable media (e.g. DVD or CD-ROM). Target platforms are file servers (Unix) with large structured directories of several gigabytes. The main benefit of such a backup is quick partial restore, as the files are intelligently backed up without any continuation prerequisite. This also means that one failed disk or archive will not affect the restore of other disks.

Overview

This is a utility for preparing data for a compressed full backup of directory trees. It collects files into subarchives of type ZIP, which can be opened with WinZip or a similar Unix tool. The aim of this utility is to prepare data for a DVD or CD-ROM writer with a logically dissected directory tree and to fill up the removable media as much as possible.

The backup data preparation is a two-step process:

  1. Collapse each tree of small subdirectories into one ZIP file named after the subdirectory tree. Collect the files of large directories into size-limited (granulated) ZIP files.
  2. Create a backup plan for the size of the target media using symbolic links. Large ZIP archives are split with bin packing into two pieces to fill the media completely.
The process is divided into two AWK scripts because the first step, colapse, normally runs for several hours when there are gigabytes to back up. disk-split runs relatively quickly and can be repeated with several trial settings to get an overview of the number of removable media needed for the backup.

Usage

Edit colapse.awk and disk-split.awk for correct directories and file sizes.

awk -f colapse.awk
awk -f disk-split.awk

I recommend running this with the batch, at or nohup command.

Removable media sizes

It is always recommended that you check the media size with your recorder. DVD media sizes are stated the same way as hard disk sizes.
  • For DVD 4.7 GB == 4700000 KB == 4588 MB
  • DVD-RW and DVD+RW are somewhat smaller than DVD-R or DVD+R - 4540 MB
  • CD-R and CD-RW eg. 700 MB == 700*1024*1024 B
Caveats

    Up to 100 disks is a sane size for a removable media backup. Only regular files are archived; symbolic links are not. Change to find -type f -or -type l when collecting files if you want this functionality.

    Download

  • FreeBSD 5 version
  • Linux version
  • Source code

    This is the FreeBSD version, which differs from the Linux one only in find capabilities. For other Unices the find command can differ substantially.

    colapse.awk

    # This AWK script will create a compressed backup tree with ZIP archiver
    # limiting the max size of the .zip files to GRANULATION.
    # If a directory is too big then its subdirectories are compressed recursively.
    # Files in big directories are split into several archives if needed.
    # Note that ZIP file size is limited to 2^31 bytes (2GB)
    # usage: awk -f colapse.awk
    # author: (C) 2003, Leon Kos
    # License: GPL
    # OS: BSD
    
    # Walk every directory below SRC (parents are listed before their
    # children) and build the compressed backup tree below DST:
    #   - a directory larger than GRANULATION is mirrored as a directory in DST
    #     and its own files are packed by collect_files();
    #   - a smaller directory is zipped recursively into one archive, but only
    #     when it sits directly below SRC or below a large directory -
    #     otherwise an enclosing archive already contains it.
    BEGIN{
    
      SRC="/home/home/staff";
      DST="/home/staff-backup";
    
    # File size granularity in megabytes
      GRANULATION=2000;
     
      GRANULATION *= 1024 * 1024;
    
      ZIP_OPTS="-9q";
     
    # du and stat output is split on tabs
      FS="\t";
     
    # Start from an empty backup tree. DST is recreated immediately so that
    # archives can be written even if no directory exceeds GRANULATION.
      system("rm -rf \"" DST "\"; mkdir -p \"" DST "\"");
     
      findcmd = "find \"" SRC "\" -type d -print";
      while((findcmd | getline) > 0)
        {
          # Path relative to SRC. SRC is compared literally rather than used
          # as a regular expression, so special characters in it are harmless.
          dir_name = $0;
          if (index(dir_name, SRC) == 1)
            dir_name = substr(dir_name, length(SRC) + 1);
          sub(/^\//, "", dir_name);

          # Disk usage of the whole subtree in bytes (du -sk reports kilobytes).
          # A failing du leaves size at 0, i.e. the directory counts as small.
          cmd = "du -sk \"" SRC "/" dir_name "\"";
          size = 0;
          if ((cmd | getline du_line) > 0)
            {
              split(du_line, du_field, "\t");
              size = du_field[1] * 1024;
            }
          close(cmd);
          
          # Large directory. SRC itself (dir_name == "") is always handled
          # here: it has no name an archive could be called after, and this
          # way the files directly inside SRC are archived in every case.
          if (size > GRANULATION || dir_name == "")
            {
              tree["/" dir_name] = size;
              cmd = "mkdir -p \"" DST "/" dir_name "\"";
              print DST "/" dir_name
              system(cmd);
              collect_files(SRC, DST, dir_name);
            }
          else # Small leaf directories are recursively stored into archives
            {
              # parent = "/" followed by the path of the enclosing directory,
              # or "" for a directory directly below SRC
              depth = split(dir_name, A, "/");
              parent = "";
              for (i = 1; i < depth; i++)
                parent = parent "/" A[i];
    
              # "in" tests membership without creating an empty array entry.
              # If the parent is not a large directory, this directory is
              # already covered by the archive of one of its ancestors.
              if (parent == "" || (parent in tree))
                {
                  # "&&" keeps zip from running in the wrong place if cd fails
                  cmd = "cd \"" SRC "\" && zip " ZIP_OPTS " -r \"" \
                        DST "/" dir_name ".zip\" \"" dir_name "\"";
                  system(cmd);
                }
            }
        }
    
      close(findcmd);
      exit(0);
    
    }
    
    # Collect the regular files that sit directly in a big directory (without
    # descending into subdirectories) and pack them into numbered archives
    # <dst>/<dir_name>/#0.zip, #1.zip, ...  An archive is closed as soon as the
    # summed, uncompressed size of the files gathered for it exceeds
    # GRANULATION, so it can overshoot that limit by part of one file.
    #
    #   src       root of the tree being backed up
    #   dst       root of the backup tree
    #   dir_name  directory to process, relative to src ("" for src itself)
    #
    # Everything after dir_name is a local variable (awk idiom); callers pass
    # three arguments. Reads the globals GRANULATION and ZIP_OPTS and expects
    # FS to be a tab. The current input record ($0) is overwritten.
    function collect_files(src, dst, dir_name,    cmd, total, archive_number, file_list, file_name)
    {
      cmd = "find \"" src "/" dir_name "\" -type f -maxdepth 1 -print0 | xargs -0 stat -f '%z%t%N'";
      total = 0;
      archive_number = 0;
      file_list = "";
      while ((cmd | getline) > 0)
        {
          # stat prints "<size><TAB><path>"
          file_name = $2;

          # Make the path relative to src. src is compared literally, not as
          # a regular expression, and every leading slash is dropped so that
          # "src//name" (dir_name == "") cannot turn into an absolute path.
          if (index(file_name, src) == 1)
            file_name = substr(file_name, length(src) + 1);
          sub(/^\/+/, "", file_name);

          file_list = file_list file_name "\n";
          total += $1;
          if (total > GRANULATION)
            {
              zip_file_list(src, dst "/" dir_name "/#" archive_number, file_list);
              file_list = "";
              total = 0;
              archive_number++;
            }
        }
      close(cmd);

      # Flush the remainder. Testing the list rather than the byte count
      # keeps a last batch that consists of empty files only.
      if (file_list != "")
        zip_file_list(src, dst "/" dir_name "/#" archive_number, file_list);
    }

    # Private helper of collect_files: run zip from inside src and feed it
    # file_list (one name per line) on standard input, creating archive.
    # Piping the names keeps them off the shell command line, so neither its
    # length limit nor shell expansion of characters in the names applies.
    function zip_file_list(src, archive, file_list,    zipcmd)
    {
      zipcmd = "cd \"" src "\" && zip " ZIP_OPTS " -@ \"" archive "\"";
      printf("%s", file_list) | zipcmd;
      close(zipcmd);
    }
    
    

    disk-split.awk

    # CDR  & DVD backup
    #
    # We assume backup tree with colapse.awk utility.
    # The size of compressed archives should be at most the size of the target
    # removable media. This ensures that a split archive will span at most
    # two disks. This recommendation is not obligatory for this utility.
    # If you span an archive over more disks, more time/disk space is needed
    # for splitting. Again, please note that there is a 2^31 file size limit!
    # This means that a zip archive cannot span more than two DVD media!
    #    Small archives are only symbolically linked. To create real disk
    # images, the utility that transfers the files to the media must
    # dereference symbolic links! For example use "du -L *" in the created
    # disk directories to verify that the largest disk will fit the media!
    #    To prevent an archive from being split over two disks with a small
    # head on one disk and a large tail on another, the KEEP_TOGETHER
    # parameter sets the allowable space waste (tolerance) to be left on one
    # disk just to keep things together (e.g. 3% of disk). Please note that
    # space waste also occurs in the zipsplit utility. All sizes are given in
    # BYTES. I recommend starting with a greedy 1% KEEP_TOGETHER and increasing
    # it as long as the number of disks required for the whole backup stays the
    # same. If one wants to rely on zipsplit bin packing with no widow
    # protection, it is also admissible to set KEEP_TOGETHER=0;
    #
    # author: (C) 2003, Leon Kos
    # License: GPL
    # usage: awk -f disk-split.awk
    # OS: BSD
    #
    # Distribute the archives found below SRC over disk directories
    # DST/disk00, DST/disk01, ...: an archive that still fits on the current
    # disk is symlinked into it; one that does not fit is either split with
    # zipsplit across the current and the following disk(s), or - when it is
    # smaller than KEEP_TOGETHER - moved as a whole to a fresh disk.
    BEGIN {
    # Temporary storage of the large zips created with colapse.awk
      SRC="/home/staff-backup";
    
    # Destination of the per-disk directories
      DST="/home/dvd-split";
    
      MB=1024*1024
      DISK_SIZE=700*MB; #CDR
      DISK_SIZE=4400*MB;#DVD - the last assignment wins
    
      KEEP_TOGETHER = DISK_SIZE/100*2; # Prevent widow archives

    # Largest piece zipsplit is asked for (the 2^31 zip size limit)
      ZIP_MAX = 2^31 - 1;
      
    # stat output is "<size><TAB><path>"
      FS="\t";
    
      system("rm -rf \"" DST "\"; mkdir \"" DST "\"; mkdir \"" DST "/disk00\"");
      
      disk=0;          # number of the disk currently being filled
      total_size = 0;  # bytes already placed on that disk
    
      findcmd = "find \"" SRC "\" -type f -name '*.zip' -print0 |" \
                " xargs -0 stat -f %z%t%N";
      
      while( (findcmd | getline) > 0)
        {
          size = $1;
          filename = $2;

          # File names are concatenated, never placed inside a sprintf format,
          # so a "%" in a name cannot corrupt the generated commands.
          diskdir = sprintf("%s/disk%02d", DST, disk);
    
          if ((total_size + size > DISK_SIZE) && (size >= KEEP_TOGETHER))
            {
              # Split the archive: the first piece fills the space left on
              # the current disk, the rest goes onto new disk(s).
              #   -n  maximum size of a piece, capped by the zip size limit
              #   -r  room to keep free in the first piece, so that it is no
              #       larger than the space left; never negative, which the
              #       uncapped formula would give for a nearly empty disk
              #       bigger than ZIP_MAX
              piece_max = (DISK_SIZE > ZIP_MAX) ? ZIP_MAX : DISK_SIZE;
              room = piece_max - (DISK_SIZE - total_size);
              if (room < 0)
                room = 0;

              cmdsplit = sprintf("zipsplit -n %d -r %d", piece_max, room) \
                         " -b \"" diskdir "\" \"" filename "\"";

              # Expected output: one summary line, then one
              # "creating: <path>" line per piece. Without both lines there
              # is no first piece to move.
              if ((cmdsplit | getline) <= 0 || (cmdsplit | getline) <= 0)
                {
                  print "zipsplit failed for " filename | "cat 1>&2";
                  close(cmdsplit);
                  exit(3);
                }

              piece = $0;
              sub("creating: ", "", piece);
    
              archive = 0;
              system("mv \"" piece "\" \"" diskdir "/" \
                     mangle(filename) "." archive ".zip\"");
    
              while ( (cmdsplit | getline) > 0) # should run only once
                {
                  disk++;
                  archive++;
                  diskdir = sprintf("%s/disk%02d", DST, disk);
                  print "mkdir " diskdir;
                  system("mkdir \"" diskdir "\"");
    
                  piece = $0;
                  sub("creating: ", "", piece);

                  # The size of this piece is the fill level of the new disk
                  cmd = "stat -f %z \"" piece "\"";
                  if ((cmd | getline total_size) == 1)
                    close(cmd);
                  else
                    exit(3);
                  
                  system("mv \"" piece "\" \"" diskdir "/" \
                         mangle(filename) "." archive ".zip\"");
                }
              close(cmdsplit); 
            }
          else # just link to original
            {
              # Too big for the space left but too small to be worth
              # splitting: start a new disk and waste the remainder.
              if ((total_size + size > DISK_SIZE) && (size < KEEP_TOGETHER))
                {
                  disk++;
                  diskdir = sprintf("%s/disk%02d", DST, disk);
                  system("mkdir \"" diskdir "\"");
                  total_size = 0;
                  print "Widow protection for ", filename;
                }
    
              total_size += size;
              system("ln -s \"" filename "\" \"" \
                     diskdir "/" mangle(filename) ".zip\"");
            }
        }
      close(findcmd);
      exit(0)
    }
    
    # Turn the path of an archive into a flat file name for a disk directory:
    # strip a leading SRC, then a leading DST, drop one leading "/", replace
    # every remaining "/" by "-" and remove a trailing ".zip".
    #
    #   filename  path of the archive
    #   mangled   local variable (awk idiom) - callers pass filename only
    #
    # Returns the flattened name without the ".zip" extension.
    function mangle(filename,    mangled)
    {
      mangled = filename;

      # SRC and DST are matched as literal prefixes, not as regular
      # expressions, so special characters in them cannot mis-match.
      if (index(mangled, SRC) == 1)
        mangled = substr(mangled, length(SRC) + 1);
      if (index(mangled, DST) == 1)
        mangled = substr(mangled, length(DST) + 1);

      sub(/^\//,"", mangled);
      gsub(/\//, "-", mangled);
    #  sub("#", ".", mangled);
      sub(/\.zip$/, "", mangled);
      return mangled;
    }