# zope_static_mirror.py # version 0.1 """Script to create a static mirror of a zope site, like for burning on to a CD. Tries to correct URLs for documents within the site to always be relative links. Note: Still has trouble with documents which use acquisition. """ from xmlrpclib import ServerProxy, ProtocolError from htmllib import HTMLParser from formatter import NullFormatter import urllib2 import os import sys import errno import cPickle import re import time """Note that to use this script, you need to put a python script on your Zope server called A_list. A_list should have one parameter: list_of='' # python script A_list # parameter: list_of='' l = [] try: if not list_of: v = context.objectValues() else: v = context.objectValues(list_of) for c in v: id = c.getId() try: da = c.bobobase_modification_time().strftime('%Y-%m-%d %H:%M') except: da = DateTime().strftime('%Y-%m-%d %H:%M') l.append((id, da)) except: #raise l = [] return l """ # folder in which to store mirror local_base = 'mirror' domain = 'localhost:8080' # 2 lists of tuples: (subdomain name, remote and local folder name) # this one maps between subdomains and the folders they are stored in # list all domains here which should be fixed in any document fixup_domains = [('', '')] # this also maps between subdomains and the folders they are stored in # list those here that you want to actually download and mirror # notice that in my case, www.example.com and example.com both point # to the same content and live in a folder called www on the server # but live in the local_base folder in the mirror domains = [('', '')] # These are here in case I just want to update one section #domains = [('', '')] #domains = [('staff', 'staff')] #domains = [('students', 'students')] #domains = [('daily', 'news')] #domains = [('wiki', 'wiki')] # control program operation next_file_delay = 0 # seconds maximum_depth = 20 force_doc_update = 0 fixup_only = 0 if len(sys.argv) > 1: print 'Force doc update: ', sys.argv[1] force_doc_update = 1 local_file_temp_extension = '._original_' # To remove these files from the mirror before # starting to make the mirror cleanup_patterns = [ #'.save_folder_list', #'.save_file_list', #'.save_doc_list', #local_file_temp_extension ] cleanup_only = 0 folder_types = ['Folder', 'AnonyMail', 'ImagePagerFolder'] doc_types = ['DTML Method', 'DTML Document', 'ZWiki Page'] file_types = ['Image', 'File'] # These are objects usually acquired from the root directory force_toplevel_folders = ['images'] force_toplevel_files = ['favicon.ico'] force_toplevel_docs = ['images.html']#'new'] # Date style should match that used on the server side cur_date = time.strftime('%Y-%m-%d %H:%M') class Parser(HTMLParser): def clear_link_list(self): self.link_list = [] def start_a(self, attributes): for a in attributes: if a[0] == 'href': self.link_list.append(a[1]) def start_img(self, attributes): for a in attributes: if a[0] == 'src': self.link_list.append(a[1]) def start_td(self, attributes): for a in attributes: if a[0] == 'background': self.link_list.append(a[1]) def start_body(self, attributes): for a in attributes: if a[0] == 'background': self.link_list.append(a[1]) def start_link(self, attributes): for a in attributes: if a[0] == 'href': self.link_list.append(a[1]) def get_link_list(self): return self.link_list parser = Parser(NullFormatter()) class Mirror: def __init__(self, local_base, domain, domains, fixup_domains): self.local_base = local_base self.domain = domain self.domains = domains self.fixup_domains = fixup_domains def main(self): if cleanup_patterns: print 'Cleaning up mirror area' cleanup() print 'done' print if cleanup_only: sys.exit() print 'Retrieving files and domain fixup' for subdomain, folder in self.domains: full_domain = get_full_domain(subdomain, self.domain) self.local_folder = folder self.mirror_folder(full_domain, folder, depth=1) print 'done' print print 'Zope fixup' for subdomain, folder in domains: full_domain = get_full_domain(subdomain, self.domain) self.mirror_folder_zope(full_domain, folder, depth=1) print 'done' print def mirror_folder(self, full_domain, local_folder, depth=1): if maximum_depth and depth > maximum_depth: return self.depth = depth remote_folder = '/'.join(local_folder.split('/')[1:]) pathname = '%s/%s' % (self.local_base, local_folder) makedir(pathname) if not fixup_only: file_list = list_files(full_domain, remote_folder) saved_file_list = load_a_list(self.local_base, local_folder, '.save_file_list') mod_dates = get_saved_mod_dates(saved_file_list) #print folder, 'files', file_list for a_file, mod_date in file_list: b_file = a_file if a_file == 'index_html': b_file = 'index.html' local_file = get_full_path(local_base, local_folder, b_file) exists = os.path.exists(local_file) url = 'http://%s/%s/%s' % (full_domain, remote_folder, a_file) if not exists or (mod_dates.has_key(a_file) and mod_date > mod_dates[a_file]): delay() print 'retrieving', url sys.stdout.flush() f = file(local_file, 'w') data = readURL(url) f.write(data) else: print 'up-to-date %s/%s' % (local_folder, b_file) save_a_list(file_list, self.local_base, local_folder, '.save_file_list') saved_doc_list = load_a_list(self.local_base, local_folder, '.save_doc_list') if not fixup_only: doc_list = list_docs(full_domain, remote_folder) #print folder, 'docs', doc_list else: doc_list = saved_doc_list saved_mod_dates = get_saved_mod_dates(saved_doc_list) #print 'mod_dates', mod_dates for doc, mod_date in doc_list: b_doc = doc if doc == 'index_html': b_doc = 'index.html' local_file = get_full_path(self.local_base, local_folder, b_doc) local_file_temp = local_file + local_file_temp_extension exists = os.path.exists(local_file_temp) if force_doc_update or not exists or (saved_mod_dates.has_key(doc) and mod_date > saved_mod_dates[doc]): if exists and doc == 'index_html' and mod_date == cur_date and not force_doc_update: print 'not updating index_html' f_temp = file(local_file_temp) data = f_temp.read() f_temp.close() else: delay() print 'retrieving', full_path = get_full_path(full_domain, remote_folder, doc) url = 'http://%s' % full_path print url, sys.stdout.flush() f_ = file(local_file_temp, 'w') data = readURL(url) f_.write(data) f_.close() else: print 'up-to-date', f_temp = file(local_file_temp) data = f_temp.read() f_temp.close() print '%s/%s' % (local_folder, doc), print 'fixing', data = self.fixup(data, depth) f = file(local_file, 'w') f.write(data) f.close() print 'done' save_a_list(doc_list, self.local_base, local_folder, '.save_doc_list') if not fixup_only: folder_list = list_folders(full_domain, remote_folder) save_a_list(folder_list, self.local_base, local_folder, '.save_folder_list') #print folder, 'folders', folder_list else: folder_list = load_a_list(self.local_base, local_folder, '.save_folder_list') for subfolder, mod_date in folder_list: sf = '%s/%s' % (local_folder, subfolder) self.mirror_folder(full_domain, sf, depth+1) def undomain(self, matchobj): change_to = '../'*self.depth if self.local_folder: change_to += self.local_folder+'/' r = '%s="%sindex.html"' % (matchobj.group(1), change_to) print 'undomain', r return r def fixup(self, data, depth): # This section makes changes which do not need to be checked any place # else. ie, changing from absolute domain paths to local paths # and adding index_html to urls the reference folders # removes everything after the ? in urls # this is not ideal, but much more work # needs to be done to get this right data = re.sub(r'((HREF|href)="[^"]*)\?[^"]*"', '\g<1>"', data) for subdomain, local_folder in self.fixup_domains: full_domain = get_full_domain(subdomain, self.domain) change_from = "http://%s/" % full_domain if self.local_folder == '': if local_folder == '': change_to = '%s' % ('../' * (depth-1)) else: change_to = '%s%s/' % ('../' * (depth-1), local_folder) else: if local_folder == '': change_to = '%s' % ('../' * (depth)) else: change_to = '%s%s/' % ('../' * (depth), local_folder) data = data.replace(change_from, change_to) change_from = "http://%s" % full_domain data = data.replace(change_from, change_to) # also, add index_html to urls that end in / data = re.sub(r'(HREF|SRC|href|src)="(http://[^ ]*%s[^ ]*?/)"' % self.domain, add_index_html, data) data = re.sub(r'(HREF|SRC|href|src)="(/[^ ]*?/)"', add_index_html, data) data = re.sub(r'(HREF|SRC|href|src)="((?!http://)[^ ]*?/)"', add_index_html, data) return data def mirror_folder_zope(self, full_domain, local_folder, depth=1): # Need to take care of urls that start with / # and relative urls that might be acquiring something # from a higher level folder. # Also, it would be nice to make sure that urls that do not # end in / or /index_html actually point to a usable doc. # Check first to just make sure the link points at something. # If it is a folder, then add index_html # Otherwise, need to look around and try to figure out what it wants # should also check for links to external images if maximum_depth and depth > maximum_depth: return self.local_folder = local_folder self.depth = depth if local_folder: pathname = '%s/%s' % (self.local_base, local_folder) #print 'lb', self.local_base, 'lf', local_folder, 'depth', depth else: pathname = self.local_base doc_list = load_a_list(self.local_base, local_folder, '.save_doc_list') for doc, mod_date in doc_list: b_doc = doc if doc == 'index_html': b_doc = 'index.html' print '%s/%s fixup_zope' % (pathname, b_doc), local_file = get_full_path(self.local_base, local_folder, b_doc) f = file(local_file, 'r+') data = f.read() data = self.fixup_zope(data, depth) f.seek(0) f.truncate() f.write(data) f.close() print 'done' print folder_list = load_a_list(self.local_base, local_folder, '.save_folder_list') #print folder, 'saved folders', folder_list for subfolder, mod_date in folder_list: if local_folder: sf = '%s/%s' % (local_folder, subfolder) else: sf = subfolder #print 'f', local_folder, 'sf', subfolder, 'depth', depth self.mirror_folder_zope(full_domain, sf, depth+1) def unroot(self, matchobj): doc = matchobj.group(2)[1:] if self.local_folder: pathname = '/%s/%s' % (self.local_folder, doc) else: pathname = '%s' % (doc) change_to = '../'*(self.depth-1) if os.path.isdir(pathname): doc += '/index.html' r = '%s="%s%s"' % (matchobj.group(1), change_to, doc) print 'unroot', r return r def ensure_existence(self, link): if self.local_folder: pathname = '%s/%s/%s' % (self.local_base, self.local_folder, link) remotename = '%s/%s' % (self.local_folder, link) else: pathname = '%s/%s' % (self.local_base, link) remotename = link def is_folder(self, link): if self.local_folder: pathname = '%s/%s/%s' % (self.local_base, self.local_folder, link) else: pathname = '%s/%s' % (self.local_base, link) return os.path.isdir(pathname) def add_index_html(self, matchobj): doc = matchobj.group(2) doc += '/index.html' r = '%s="%s"' % (matchobj.group(1), doc) print 'add_index', r return r def fixup_zope(self, data, depth=1): parser.clear_link_list() parser.feed(data) link_list = parser.get_link_list() #print 'link_list', link_list for link in link_list: if link.startswith('https'): # leave these alone print 'secure', link if link.startswith('/'): print 'rooted', link data = re.sub(r'(HREF|href|SRC|src|BACKGROUND|background)="(%s)"' % link, self.unroot, data) if not link.endswith('/index_html') and self.is_folder(link): print 'add /index_html', link data = re.sub(r'(HREF|href|SRC|src)="(%s)"' % link, self.add_index_html, data) return data def add_index_html(matchobj): #print 'group', matchobj.group(), matchobj.group(1), matchobj.group(2) return '%s="%sindex.html"' % (matchobj.group(1), matchobj.group(2)) def list_all(full_domain, folder): url = 'http://%s/%s' % (full_domain, folder) try: server = ServerProxy(url) all_list = server.A_list() #print 'all_list', all_list except (ProtocolError, IOError), e: all_list = [] return all_list def list_folders(full_domain, folder): url = 'http://%s/%s' % (full_domain, folder) try: server = ServerProxy(url) folder_list = server.A_list(folder_types) except (ProtocolError, IOError), e: print 'WARNING list_folders failed. Returning empty list' folder_list = [] # for folders acquired from root... if not folder: folder_names = [item[0] for item in folder_list] for f in force_toplevel_folders: if f not in folder_names: folder_list.append((f, cur_date)) return folder_list def list_files(full_domain, folder): url = 'http://%s/%s' % (full_domain, folder) try: server = ServerProxy(url) file_list = server.A_list(file_types) except (ProtocolError, IOError), e: file_list = [] # for files acquired from root... if not folder: file_names = [item[0] for item in file_list] for f in force_toplevel_files: if f not in file_names: file_list.append((f, cur_date)) return file_list def list_docs(full_domain, folder): url = 'http://%s/%s' % (full_domain, folder) try: server = ServerProxy(url) doc_list = server.A_list(doc_types) #print 'doc_list', doc_list except (ProtocolError, IOError), e: doc_list = [] # every folder needs an interface, even if there is not one # in the folder itself. So, we insert index_html in to the # list if it is not there doc_names = [item[0] for item in doc_list] if 'index_html' not in doc_names: doc_list.append(('index_html', cur_date)) # for files acquired from root... if not folder: doc_names = [item[0] for item in doc_list] for f in force_toplevel_docs: if f not in doc_names: doc_list.append((f, cur_date)) return doc_list def save_a_list(a_list, base, folder, name): f = file('%s/%s/%s' % (base, folder, name), 'w') cPickle.dump(a_list, f) f.close() def load_a_list(base, folder, name): try: f = file('%s/%s/%s' % (base, folder, name)) a_list = cPickle.load(f) f.close() except IOError, e: if e.errno == errno.ENOENT: a_list = [] else: raise return a_list def get_saved_mod_dates(a_list): mod_dates = {} for file_name, date in a_list: mod_dates[file_name] = date return mod_dates def get_full_domain(subdomain, domain): if subdomain == '': full_domain = domain else: full_domain = "%s.%s" % (subdomain, domain) return full_domain def get_full_path(base, folder, doc): if folder == '': full_path = '%s/%s' % (base, doc) else: full_path = "%s/%s/%s" % (base, folder, doc) return full_path def readURL(url): """Returns the file found in the URL""" try: f = urllib2.urlopen(url) data = f.read() except urllib2.HTTPError, e: if e.code == 401: print 'password protected', e data = 'password protected' else: print 'other HTTP error', e data = 'HTTP Error' except AssertionError: print 'AssertionError' data = 'Assertion Error' try: f.close() except NameError: pass return data # Create a directory if it doesn't exist. # Create parent directories as well if needed. def makedir(pathname): try: os.makedirs(pathname, 0755) except OSError, e: if e.errno == errno.EEXIST: pass else: raise def delay(): if next_file_delay: print 'delaying %s seconds... ' % next_file_delay, sys.stdout.flush() time.sleep(next_file_delay) print 'proceeding' def cleanup(): for pat in cleanup_patterns: comm = 'find %s -name "*%s" | xargs rm' % (local_base, pat) os.system(comm) if __name__ == '__main__': mirror = Mirror(local_base, domain, domains, fixup_domains) mirror.main()