# zope_static_mirror.py
# version 0.1


"""Script to create a static mirror of a zope site, like
for burning onto a CD. Tries to rewrite URLs that point to documents
within the site so that they are always relative links.

Note:
Still has trouble with documents which use acquisition.

"""


from xmlrpclib import ServerProxy, ProtocolError
from htmllib import HTMLParser
from formatter import NullFormatter
import urllib2
import os
import sys
import errno
import cPickle
import re
import time


"""Note that to use this script, you need to put a python script on your
Zope server called A_list.  A_list should have one parameter, list_of='',
and return a list of (id, modification date) pairs, as in this example:


# python script  A_list
# parameter:  list_of=''

l = []

try:
    if not list_of:
        v = context.objectValues()
    else:
        v = context.objectValues(list_of)

    for c in v:
        id = c.getId()
        try:
            da = c.bobobase_modification_time().strftime('%Y-%m-%d %H:%M')
        except:
            da = DateTime().strftime('%Y-%m-%d %H:%M')
        l.append((id, da))

except:
    #raise
    l = []

return l

"""


# Local folder in which the mirror is stored
local_base = 'mirror'

# Host (and port) of the Zope server; subdomain names are prefixed to
# this with a '.' to build the full host name of each subdomain
domain = 'localhost:8080'

# Two lists of tuples: (subdomain name, remote and local folder name)
#
# fixup_domains maps between subdomains and the folders they are stored in.
# List here every domain whose absolute URLs should be rewritten in any
# document.
fixup_domains = [('', '')]
# domains also maps between subdomains and the folders they are stored in.
# List here the ones that you actually want to download and mirror.
# Notice that in my case, www.example.com and example.com both point
# to the same content and live in a folder called www on the server
# but live in the local_base folder in the mirror
domains = [('', '')]

# These are here in case I just want to update one section
#domains = [('', '')]
#domains = [('staff', 'staff')]
#domains = [('students', 'students')]
#domains = [('daily', 'news')]
#domains = [('wiki', 'wiki')]


# Flags that control program operation
next_file_delay = 0 # seconds to sleep before each download (0 = no delay)
maximum_depth = 20 # deepest folder level to descend to (0 = no limit)
force_doc_update = 0 # if true, download every document again
fixup_only = 0 # if true, skip the listing calls and reuse the saved lists

# Any command line argument at all turns on force_doc_update
if len(sys.argv) > 1:
    print 'Force doc update: ', sys.argv[1]
    force_doc_update = 1

# Appended to a document's local file name to name the copy that holds
# the document exactly as downloaded, before any link fixing
local_file_temp_extension = '._original_'

# File name endings to remove from the mirror before
# starting to make the mirror (nothing is removed while the list is empty)
cleanup_patterns = [
#'.save_folder_list',
#'.save_file_list',
#'.save_doc_list',
#local_file_temp_extension
]
# If true, exit right after the cleanup; only checked when
# cleanup_patterns is not empty
cleanup_only = 0

# Type names passed to A_list as list_of to select folders,
# documents (which get their links fixed) and files (stored unchanged)
folder_types = ['Folder', 'AnonyMail', 'ImagePagerFolder']
doc_types = ['DTML Method', 'DTML Document', 'ZWiki Page']
file_types = ['Image', 'File']

# These are objects usually acquired from the root directory.
# They are added to the listings of the root folder when the server
# does not report them there.
force_toplevel_folders = ['images']
force_toplevel_files = ['favicon.ico']
force_toplevel_docs = ['images.html']#'new']

# Date style should match that used on the server side.
# Used as the modification date of list entries added by this script.
cur_date = time.strftime('%Y-%m-%d %H:%M')


class Parser(HTMLParser):
    def clear_link_list(self):
        self.link_list = []

    def start_a(self, attributes):
        for a in attributes:
            if a[0] == 'href':
                self.link_list.append(a[1])
    
    def start_img(self, attributes):
        for a in attributes:
            if a[0] == 'src':
                self.link_list.append(a[1])

    def start_td(self, attributes):
        for a in attributes:
            if a[0] == 'background':
                self.link_list.append(a[1])

    def start_body(self, attributes):
        for a in attributes:
            if a[0] == 'background':
                self.link_list.append(a[1])

    def start_link(self, attributes):
        for a in attributes:
            if a[0] == 'href':
                self.link_list.append(a[1])

    def get_link_list(self):
        return self.link_list

parser = Parser(NullFormatter())


class Mirror:
    def __init__(self, local_base, domain, domains, fixup_domains):
        self.local_base = local_base
        self.domain = domain
        self.domains = domains
        self.fixup_domains = fixup_domains
            
    def main(self):
        if cleanup_patterns:
            print 'Cleaning up mirror area'
            cleanup()
            print 'done'
            print
            if cleanup_only:
                sys.exit()
            
        print 'Retrieving files and domain fixup'
        for subdomain, folder in self.domains:
            full_domain = get_full_domain(subdomain, self.domain)
            self.local_folder = folder
            self.mirror_folder(full_domain, folder, depth=1)
        print 'done'
        print
            
        print 'Zope fixup'
        for subdomain, folder in domains:
            full_domain = get_full_domain(subdomain, self.domain)
            self.mirror_folder_zope(full_domain, folder, depth=1)
        print 'done'
        print
          
            
    def mirror_folder(self, full_domain, local_folder, depth=1):
        if maximum_depth and depth > maximum_depth:
            return

        self.depth = depth
            
        remote_folder = '/'.join(local_folder.split('/')[1:])
            
        pathname = '%s/%s' % (self.local_base, local_folder)
        makedir(pathname)

        if not fixup_only:
            file_list = list_files(full_domain, remote_folder)
            saved_file_list = load_a_list(self.local_base, local_folder, '.save_file_list')
            mod_dates = get_saved_mod_dates(saved_file_list)
            #print folder, 'files', file_list
            for a_file, mod_date in file_list:
                b_file = a_file
                if a_file == 'index_html':
                    b_file = 'index.html'
                local_file = get_full_path(local_base, local_folder, b_file)
                exists = os.path.exists(local_file)
                url = 'http://%s/%s/%s' % (full_domain, remote_folder, a_file)
                if not exists or (mod_dates.has_key(a_file) and mod_date > mod_dates[a_file]):
                    delay()
                    print 'retrieving', url
                    sys.stdout.flush()
                    f = file(local_file, 'w')
                    data = readURL(url)
                    f.write(data)
                else:
                    print 'up-to-date %s/%s' % (local_folder, b_file)
            save_a_list(file_list, self.local_base, local_folder, '.save_file_list')

        saved_doc_list = load_a_list(self.local_base, local_folder, '.save_doc_list')
        if not fixup_only:
            doc_list = list_docs(full_domain, remote_folder)
            #print folder, 'docs', doc_list
        else:
            doc_list = saved_doc_list
        saved_mod_dates = get_saved_mod_dates(saved_doc_list)
        #print 'mod_dates', mod_dates
        for doc, mod_date in doc_list:
            b_doc = doc
            if doc == 'index_html':
                b_doc = 'index.html'
            local_file = get_full_path(self.local_base, local_folder, b_doc)
            local_file_temp = local_file + local_file_temp_extension
            exists = os.path.exists(local_file_temp)

            if force_doc_update or not exists or (saved_mod_dates.has_key(doc) and mod_date > saved_mod_dates[doc]):
                if exists and doc == 'index_html' and mod_date == cur_date and not force_doc_update:
                    print 'not updating index_html'
                    f_temp = file(local_file_temp)
                    data = f_temp.read()
                    f_temp.close()
                else:
                    delay()
                    print 'retrieving',
                    full_path = get_full_path(full_domain, remote_folder, doc)
                    url = 'http://%s' % full_path
                    print url,
                    sys.stdout.flush()
                    f_ = file(local_file_temp, 'w')
                    data = readURL(url)
                    f_.write(data)
                    f_.close()
            else:
                print 'up-to-date',
                f_temp = file(local_file_temp)
                data = f_temp.read()
                f_temp.close()
            print '%s/%s' % (local_folder, doc),
            print 'fixing',
            data = self.fixup(data, depth)
            f = file(local_file, 'w')
            f.write(data)
            f.close()
            print 'done'
        save_a_list(doc_list, self.local_base, local_folder, '.save_doc_list')

        if not fixup_only:
            folder_list = list_folders(full_domain, remote_folder)
            save_a_list(folder_list, self.local_base, local_folder, '.save_folder_list')
            #print folder, 'folders', folder_list
        else:
            folder_list = load_a_list(self.local_base, local_folder, '.save_folder_list')
        for subfolder, mod_date in folder_list:
            sf = '%s/%s' % (local_folder, subfolder)
            self.mirror_folder(full_domain, sf, depth+1)

    def undomain(self, matchobj):
        change_to = '../'*self.depth
        if self.local_folder:
            change_to += self.local_folder+'/'
        r = '%s="%sindex.html"' % (matchobj.group(1), change_to)
        print 'undomain', r
        return r
        
    def fixup(self, data, depth):
        # This section makes changes which do not need to be checked any place
        # else. ie, changing from absolute domain paths to local paths
        # and adding index_html to urls the reference folders
        
        
        # removes everything after the ? in urls
        # this is not ideal, but much more work
        # needs to be done to get this right
        data = re.sub(r'((HREF|href)="[^"]*)\?[^"]*"', '\g<1>"', data)
    
        for subdomain, local_folder in self.fixup_domains:
            full_domain = get_full_domain(subdomain, self.domain)
            
            change_from = "http://%s/" % full_domain
            if self.local_folder == '':
                if local_folder == '':
                    change_to = '%s' % ('../' * (depth-1))
                else:
                    change_to = '%s%s/' % ('../' * (depth-1), local_folder)
                
            else:
                if local_folder == '':
                    change_to = '%s' % ('../' * (depth))
                else:
                    change_to = '%s%s/' % ('../' * (depth), local_folder)
            data = data.replace(change_from, change_to)

            change_from = "http://%s" % full_domain
            data = data.replace(change_from, change_to)
            
            # also, add index_html to urls that end in /
            data = re.sub(r'(HREF|SRC|href|src)="(http://[^ ]*%s[^ ]*?/)"' % self.domain, add_index_html, data)
            data = re.sub(r'(HREF|SRC|href|src)="(/[^ ]*?/)"', add_index_html, data)
            data = re.sub(r'(HREF|SRC|href|src)="((?!http://)[^ ]*?/)"', add_index_html, data)

        return data

        
    def mirror_folder_zope(self, full_domain, local_folder, depth=1):
        # Need to take care of urls that start with /
        # and relative urls that might be acquiring something
        # from a higher level folder.
        
        # Also, it would be nice to make sure that urls that do not
        # end in / or /index_html actually point to a usable doc.
        
        # Check first to just make sure the link points at something.
        # If it is a folder, then add index_html
        # Otherwise, need to look around and try to figure out what it wants
        
        # should also check for links to external images
        
        if maximum_depth and depth > maximum_depth:
            return

        self.local_folder = local_folder
        self.depth = depth
            
        if local_folder:
            pathname = '%s/%s' % (self.local_base, local_folder)
            #print 'lb', self.local_base, 'lf', local_folder, 'depth', depth
        else:
            pathname = self.local_base
            
        doc_list = load_a_list(self.local_base, local_folder, '.save_doc_list')
        for doc, mod_date in doc_list:
            b_doc = doc
            if doc == 'index_html':
                b_doc = 'index.html'
            print '%s/%s fixup_zope' % (pathname, b_doc),
            local_file = get_full_path(self.local_base, local_folder, b_doc)
            f = file(local_file, 'r+')
            data = f.read()
            data = self.fixup_zope(data, depth)

            f.seek(0)
            f.truncate()
            f.write(data)
            f.close()
            print 'done'
            print
        
        folder_list = load_a_list(self.local_base, local_folder, '.save_folder_list')
        #print folder, 'saved folders', folder_list
        for subfolder, mod_date in folder_list:
            if local_folder:
                sf = '%s/%s' % (local_folder, subfolder)
            else:
                sf = subfolder
            #print 'f', local_folder, 'sf', subfolder, 'depth', depth
            self.mirror_folder_zope(full_domain, sf, depth+1)


    def unroot(self, matchobj):
        doc = matchobj.group(2)[1:]
        
        if self.local_folder:
            pathname = '/%s/%s' % (self.local_folder, doc)
        else:
            pathname = '%s' % (doc)

        change_to = '../'*(self.depth-1)
        
        if os.path.isdir(pathname):
            doc += '/index.html'

        r = '%s="%s%s"' % (matchobj.group(1), change_to, doc)
        print 'unroot', r
        return r

    def ensure_existence(self, link):
        if self.local_folder:
            pathname = '%s/%s/%s' % (self.local_base, self.local_folder, link)
            remotename = '%s/%s' % (self.local_folder, link)
        else:
            pathname = '%s/%s' % (self.local_base, link)
            remotename = link

    def is_folder(self, link):
        if self.local_folder:
            pathname = '%s/%s/%s' % (self.local_base, self.local_folder, link)
        else:
            pathname = '%s/%s' % (self.local_base, link)
        return os.path.isdir(pathname)
            
    def add_index_html(self, matchobj):
        doc = matchobj.group(2)
        doc += '/index.html'
        
        r = '%s="%s"' % (matchobj.group(1), doc)
        print 'add_index', r
        return r
        
    def fixup_zope(self, data, depth=1):
        parser.clear_link_list()
        parser.feed(data)
        link_list = parser.get_link_list()
        #print 'link_list', link_list

        for link in link_list:
            if link.startswith('https'):
                # leave these alone
                print 'secure', link
            if link.startswith('/'):
                print 'rooted', link
                data = re.sub(r'(HREF|href|SRC|src|BACKGROUND|background)="(%s)"' % link, self.unroot, data)
            if not link.endswith('/index_html') and self.is_folder(link):
                print 'add /index_html', link
                data = re.sub(r'(HREF|href|SRC|src)="(%s)"' % link, self.add_index_html, data)

        return data

        
def add_index_html(matchobj):
    """re.sub callback: append index.html to a matched URL.

    matchobj -- match with the attribute name in group 1 and a URL that
                ends in "/" in group 2

    Returns the replacement text  name="URLindex.html".
    """
    attribute, target = matchobj.group(1, 2)
    return '%s="%sindex.html"' % (attribute, target)


def list_all(full_domain, folder):
    """Return the A_list result for `folder` on `full_domain`, without a
    type filter.

    Returns an empty list if the call fails with a ProtocolError or
    an IOError.
    """
    try:
        proxy = ServerProxy('http://%s/%s' % (full_domain, folder))
        return proxy.A_list()
    except (ProtocolError, IOError):
        return []


def list_folders(full_domain, folder):
    url = 'http://%s/%s' % (full_domain, folder)
    try:
        server = ServerProxy(url)
        folder_list = server.A_list(folder_types)
    except (ProtocolError, IOError), e:
        print 'WARNING list_folders failed. Returning empty list'
        folder_list = []

    # for folders acquired from root...
    if not folder:
        folder_names = [item[0] for item in folder_list]
        for f in force_toplevel_folders:
            if f not in folder_names:
                folder_list.append((f, cur_date))
    
    return folder_list


def list_files(full_domain, folder):
    url = 'http://%s/%s' % (full_domain, folder)
    try:
        server = ServerProxy(url)
        file_list = server.A_list(file_types)
    except (ProtocolError, IOError), e:
        file_list = []

    # for files acquired from root...
    if not folder:
        file_names = [item[0] for item in file_list]
        for f in force_toplevel_files:
            if f not in file_names:
                file_list.append((f, cur_date))

    return file_list


def list_docs(full_domain, folder):
    url = 'http://%s/%s' % (full_domain, folder)
    try:
        server = ServerProxy(url)
        doc_list = server.A_list(doc_types)
        #print 'doc_list', doc_list
    except (ProtocolError, IOError), e:
        doc_list = []
        
    # every folder needs an interface, even if there is not one
    # in the folder itself. So, we insert index_html in to the
    # list if it is not there
    doc_names = [item[0] for item in doc_list]
    if 'index_html' not in doc_names:
        doc_list.append(('index_html', cur_date))
    # for files acquired from root...
    if not folder:
        doc_names = [item[0] for item in doc_list]
        for f in force_toplevel_docs:
            if f not in doc_names:
                doc_list.append((f, cur_date))

    return doc_list


def save_a_list(a_list, base, folder, name):
    """Pickle `a_list` into the file base/folder/name, replacing it.

    Raises IOError if the file cannot be opened for writing.
    """
    f = file('%s/%s/%s' % (base, folder, name), 'w')
    try:
        cPickle.dump(a_list, f)
    finally:
        # Close the file even if pickling fails
        f.close()


def load_a_list(base, folder, name):
    try:
        f = file('%s/%s/%s' % (base, folder, name))
        a_list = cPickle.load(f)
        f.close()
    except IOError, e:
        if e.errno == errno.ENOENT:
            a_list = []
        else:
            raise
    
    return a_list


def get_saved_mod_dates(a_list):
    """Return a dictionary that maps each name in `a_list` to its date.

    a_list -- sequence of (name, date) pairs; if a name occurs more than
              once, its last date wins
    """
    return dict([(entry_name, stamp) for entry_name, stamp in a_list])


def get_full_domain(subdomain, domain):
    """Return the host name of `subdomain` within `domain`.

    An empty subdomain stands for the domain itself.
    """
    if subdomain != '':
        return "%s.%s" % (subdomain, domain)
    return domain
    

def get_full_path(base, folder, doc):
    """Return base, folder and doc joined with "/".

    An empty folder is left out, so that no double slash appears.
    """
    if folder != '':
        return "%s/%s/%s" % (base, folder, doc)
    return '%s/%s' % (base, doc)
    
   
def readURL(url):
    """Returns the file found in the URL"""

    try:
        f = urllib2.urlopen(url)
        data = f.read()
    except urllib2.HTTPError, e:
        if e.code == 401:
            print 'password protected', e
            data = 'password protected'
        else:
            print 'other HTTP error', e
            data = 'HTTP Error'
    except AssertionError:
        print 'AssertionError'
        data = 'Assertion Error'
                            
    try:
        f.close()
    except NameError:
        pass
    
    return data
       

# Create a directory if it doesn't exist.
# Create parent directories as well if needed.
def makedir(pathname):
    try:
        os.makedirs(pathname, 0755)
    except OSError, e:
        if e.errno == errno.EEXIST:
            pass
        else:
            raise


def delay():
    if next_file_delay:
        print 'delaying %s seconds... ' % next_file_delay,
        sys.stdout.flush()
        time.sleep(next_file_delay)
        print 'proceeding'


def cleanup():
    for pat in cleanup_patterns:
        comm = 'find %s -name "*%s" | xargs rm' % (local_base, pat)
        os.system(comm)
    
    
# When run as a script: mirror the site as described by the
# settings at the top of this file
if __name__ == '__main__':
    mirror = Mirror(local_base, domain, domains, fixup_domains)
    mirror.main()