"""Aggregate feed items from an OPML subscription list into a DB XML container.

The script reads feed URLs from the OPML document at ``opml``, fetches each
feed with feedparser (passing the ETag / Last-Modified values remembered in
the pickle file ``dictEtagFile``), and stores every item whose content hash is
not already present in the container at ``dbxml``.

This is Python 2 code (``urllib.urlopen`` / ``urllib.quote``); ``print`` is
written in the single-argument parenthesised form, which behaves identically
under the Python 2 print statement.
"""
import os
import pickle
import re
import string
import sys
import traceback
import urllib

import feedparser
import libxml2
import mx.DateTime
from dbxml import *

dbxml = './dbxml/blog.dbxml'
opml = 'http://weblog.infoworld.com/udell/gems/mySubscriptions.opml'
dictEtagFile = 'dictEtag'


def formatExceptionInfo(maxLevel=5):
    """Describe the exception currently being handled.

    Returns a ``(name, args, traceback_lines)`` tuple where
    ``traceback_lines`` holds at most ``maxLevel`` formatted stack entries.
    """
    cla, exc, trbk = sys.exc_info()
    excName = cla.__name__
    # Exceptions generally expose ``args`` as an attribute rather than as an
    # entry in the instance ``__dict__``, so look it up with getattr instead
    # of silently reporting an empty value.
    excArgs = getattr(exc, 'args', "")
    excTb = traceback.format_tb(trbk, maxLevel)
    return (excName, excArgs, excTb)


def unpack(result):
    """Turn a parsed feed into a list of XML documents for unseen items.

    ``result`` is the mapping returned by ``feedparser.parse``.  Items whose
    content hash is already a key of the module-level ``hashDict`` are
    skipped.  Returns a (possibly empty) list of XML strings.
    """
    ret = []
    try:
        channel = result['channel']['title']
    except Exception:
        print('NoChannelTitle')
        return ret
    try:
        items = result['items']
    except Exception:
        print('NoItems')
        return ret

    for item in items:
        try:
            content = item['description']
        except Exception:
            content = item['content'][0]['value']
        content = '%s' % content

        # The content is embedded verbatim in the stored document below, so
        # it must parse as XML on its own.
        # NOTE(review): a single unparseable item ends processing of the
        # whole feed (items collected so far are still returned).  Confirm
        # whether skipping just that item was intended.
        try:
            xml = libxml2.parseDoc(content)
            xml.freeDoc()
        except Exception:
            return ret

        try:
            title = item['title']
            # Escape bare ampersands so the title is valid XML text.
            title = title.replace('&', '&amp;')
        except Exception:
            title = None

        hashValue = hash(content)
        hashKey = str(hashValue)
        if hashKey in hashDict:
            continue
        sys.stderr.write("New: %s\n" % title)

        try:
            link = item['link']
            link = urllib.quote(link)
            # quote() also escapes the scheme separator; restore it for both
            # plain and secure HTTP links.
            link = link.replace('http%3A', 'http:')
            link = link.replace('https%3A', 'https:')
        except Exception:
            link = None

        try:
            tmp = item['date']
            tmp = mx.DateTime.DateFrom(tmp)
            date = tmp.Format('%Y/%m/%d')
        except Exception:
            date = '0000/00/00'

        # Six values are formatted, so the template needs six placeholders.
        # The stored documents are queried elsewhere in this script with
        # '/item/@hash', which fixes the root element and the hash attribute.
        # NOTE(review): the child element names are a reconstruction; verify
        # them against documents already stored in the container.  The
        # channel title is inserted into the attribute without escaping.
        newitem = """
<item channel="%s" hash="%s">
<title>%s</title>
<link>%s</link>
<date>%s</date>
<content>%s</content>
</item>
""" % (channel, hashValue, title, link, date, content)
        ret.append(newitem)
        # Remember the hash so the same content seen again during this run
        # (in this feed or a later one) is not stored twice.
        hashDict[hashKey] = hashKey
    return ret


# --- ETag / Last-Modified cache -------------------------------------------

if not os.path.exists(dictEtagFile):
    print("CreatingDictEtag")
    f = open(dictEtagFile, 'wb')
    try:
        pickle.dump({}, f)
    finally:
        f.close()

try:
    f = open(dictEtagFile, 'rb')
    try:
        # The pickle is a local cache written by this script itself.
        dictEtag = pickle.load(f)
    finally:
        f.close()
except Exception:
    raise RuntimeError("CannotLoadDictEtag")

# --- Subscription list -----------------------------------------------------

opmldata = urllib.urlopen(opml).read()
opmlxml = libxml2.parseDoc(opmldata)
try:
    urls = [node.getContent() for node in opmlxml.xpathEval('//@xmlUrl')]
finally:
    opmlxml.freeDoc()

# --- Hashes of items already stored ----------------------------------------

container = XmlContainer(None, dbxml)
container.open(None, DB_CREATE)
context = XmlQueryContext(1, 0)
hashResults = container.queryWithXPath(None, '/item/@hash', context)
hashDict = {}
try:
    for i in range(hashResults.size()):
        hashValue = hashResults.next().asString(None)
        hashDict[hashValue] = hashValue
except Exception:
    print("CannotCreateHashDict")
container.close()

# --- Fetch every feed and store its new items ------------------------------

for url in urls:
    print("%s " % url)
    try:
        cached = dictEtag.setdefault(url, {})
        etag = cached.get('etag')
        mod = cached.get('mod')

        result = feedparser.parse(url, etag, mod)

        try:
            etag = result['etag']
        except KeyError:
            etag = None
        else:
            cached['etag'] = etag

        try:
            mod = result['modified']
        except KeyError:
            mod = None
        else:
            cached['mod'] = mod

        if etag is None and mod is None:
            print("%s: no etag or mod" % url)

        # 'status' is absent when the fetch did not go through HTTP, so a
        # missing key must not be treated as an error.
        if result.get('status') == 304:
            continue

        items = unpack(result)
        if items:
            container.open(None, DB_CREATE)
            try:
                for item in items:
                    doc = XmlDocument()
                    doc.setContent(item)
                    print(container.putDocument(None, doc))
            finally:
                # Always close, otherwise the next feed would try to open a
                # container that is still open.
                container.close()
    except Exception:
        print(formatExceptionInfo())
        continue

# --- Persist the cache -----------------------------------------------------

try:
    f = open(dictEtagFile, 'wb')
    try:
        pickle.dump(dictEtag, f)
    finally:
        f.close()
except Exception:
    print("CannotSaveDictEtag")