#! /usr/bin/env python # retrieve_vis04_data.py import formatter import htmllib import os import re import string import urllib def _attrs2dict(attrs): """Take an (attribute, value) list and make a dict""" dict = {} for (a,v) in attrs: dict[a] = v return dict class FileList(htmllib.HTMLParser): """Logic to retrieve the first page which lists the files, then return the files' URLs for retrieval""" base_url = 'http://www.vets.ucar.edu/vg/isabeldata/' def __init__(self, save_dir, match_string='*', debug=False): htmllib.HTMLParser.__init__(self, formatter.NullFormatter()) self.match = re.compile(match_string) self.save_dir = save_dir self.debug = debug def do_it(self): u = urllib.urlopen(self.base_url) self.feed(u.read()) def start_a(self, attrs): """We're looking for links to .gz files""" d = _attrs2dict(attrs) link = d.get('href', ' ') if len(link) >= 3 and link[-3:] == '.gz': m = self.match.search(link) if m: #Found a matching file. #Get the file name for saving fn = os.path.split(link)[1] print 'getting ' + link + ' to ' + self.save_dir if not self.debug: try: urllib.urlretrieve(self.base_url + link, os.path.join(self.save_dir, fn)) except Exception, e: print '-- failure: ' + str(e) if __name__ == '__main__': ## f = FileList('h:/sources/python/testdata', 'CLOUDf1[2-4].*') import sys if len(sys.argv) < 3: print "Usage: {-d} save_dir matching_string (which is a regular expression)" else: if sys.argv[1] == '-d': debug = True dir = sys.argv[2] match = sys.argv[3] else: debug = False dir = sys.argv[1] match = sys.argv[2] f = FileList(dir, match, debug) f.do_it()