Χρήστης:Vanished user Xorisdtbdfgonugyfs/dumptools.py
(Ανακατεύθυνση από Χρήστης:Xoristzatziki/dumptools.py)
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright Xoristzatziki of el.wiktionary.org
'''Helpers for downloading, indexing and reading Wikimedia XML dumps.

Works for:
wikipedia
wikiversity
wikinews
wikisource
wikiquote
wikibooks
wiktionary

The module runs on both Python 2.7 and Python 3. It shells out to the
external tools ``grep``, ``bunzip2`` and ``curl``.
'''

import os
import shutil
import subprocess
import sys

try:  # Python 2
    import urllib2
except ImportError:  # Python 3: keep the historical module name usable
    import urllib.request as urllib2

try:  # Python 2
    from urllib import quote as _urlquote
except ImportError:  # Python 3
    from urllib.parse import quote as _urlquote

# Dump-name suffixes as they appear in dump file names ("elwiki", ...).
knownprojectdumpnames = ['wiki',
                         'wikiversity',
                         'wikinews',
                         'wikisource',
                         'wikiquote',
                         'wikibooks',
                         'wiktionary',
                         ]
basebackupurl = 'http://dumps.wikimedia.org/'
kindofdump = u'-pages-meta-current.xml'

# Markers searched for inside the raw (byte) content of a dump.
_EMPTY_TEXT_TAG = b'<text xml:space="preserve" />'
_TEXT_OPEN_TAG = b'<text xml:space="preserve">'


def _to_text(value, errors='strict'):
    '''Return value as text, decoding UTF-8 byte strings.

    Values that are not byte strings are returned unchanged.
    '''
    if isinstance(value, bytes):
        return value.decode('utf-8', errors)
    return value


def _to_native(value):
    '''Return value as the interpreter's native ``str`` type.

    subprocess arguments must be byte strings on Python 2 (unicode
    arguments are encoded as ASCII and fail on e.g. Greek titles) and
    text on Python 3.
    '''
    if isinstance(value, str):
        return value
    if isinstance(value, bytes):  # Python 3 bytes
        return value.decode('utf-8')
    return value.encode('utf-8')  # Python 2 unicode


class LatestDumpProps(object):
    '''Properties of the latest complete dump of one wiki.'''

    def __init__(self):
        self.url = u''             # full url of the .bz2 file
        self.zippedfilename = u''  # file name of the .bz2 file
        self.rawdate = u''         # dump date as found in the index
        self.nonefound = True      # True until a complete dump is found


def _latest_dump_from_index(indexhtml, wikitofind):
    '''Extract the properties of a wiki's dump from the index page.

    Only the first line whose link points exactly at ``wikitofind``
    (e.g. "elwiki", not "elwikibooks") is considered. If that dump is
    not marked as complete, or no such line exists, the returned
    LatestDumpProps still has ``nonefound`` set to True.
    '''
    ldp = LatestDumpProps()
    for line in indexhtml.splitlines():
        if 'a href="' not in line:
            continue
        backupsubdir = line.split('a href="')[1].split('">')[0]
        parts = backupsubdir.split('/')
        if len(parts) < 2 or parts[0] != wikitofind:
            continue
        if '>Dump complete<' in line:
            ldp.rawdate = parts[1]
            ldp.zippedfilename = (wikitofind + '-' + ldp.rawdate
                                  + kindofdump + '.bz2')
            ldp.url = (basebackupurl + backupsubdir + '/'
                       + ldp.zippedfilename)
            ldp.nonefound = False
        # Either complete (filled in) or still in progress (left empty).
        return ldp
    return ldp


def urloflatestdump(wikilang, wikikind):
    '''Returns url, filename and string-of-date (if not in progress).

    wikilang is the language code (e.g. "el") and wikikind one of
    knownprojectdumpnames. Always returns a LatestDumpProps; its
    ``nonefound`` attribute stays True when the index could not be
    fetched, the wiki is not listed or its dump is not complete.
    '''
    _wikitofind = wikilang + wikikind
    try:
        _response = urllib2.urlopen(basebackupurl + 'backup-index.html',
                                    None, 3)
        try:
            rawindex = _response.read()
        finally:
            _response.close()
        return _latest_dump_from_index(_to_text(rawindex, 'replace'),
                                       _wikitofind)
    except Exception:
        # Best effort: any network or parsing problem means "none found".
        return LatestDumpProps()


def getandextractadump(propsofdump, wheretosaveit):
    '''Gets and extracts latest dump.

    Warning: Overwrites any existing files. Backup if you want them.
    Can be used to force getting latest dump.

    propsofdump is a LatestDumpProps, wheretosaveit the target directory.
    Returns a ``(success, xmlfilename)`` tuple on every path; on failure
    it is ``(False, '')``. xmlfilename is the file name (without
    directory) of the extracted dump, which bunzip2 writes next to the
    downloaded file, i.e. inside wheretosaveit.
    '''
    bzFilenamePath = os.path.join(wheretosaveit, propsofdump.zippedfilename)
    xmlfilename = os.path.splitext(os.path.basename(bzFilenamePath))[0]
    # remove old bz2
    if os.path.exists(bzFilenamePath):
        try:
            os.remove(bzFilenamePath)
        except OSError:
            print('Exception:  %s' % str(sys.exc_info()))
            return False, ''
    try:
        response = urllib2.urlopen(propsofdump.url)
        try:
            # Binary mode and chunked copy: dumps are large compressed
            # files and must not be loaded into memory or translated.
            with open(bzFilenamePath, 'wb') as bzFile:
                shutil.copyfileobj(response, bzFile)
        finally:
            response.close()
        print('Got bz2 file: %s' % bzFilenamePath)
        # bunzip2 extracts next to the source and deletes the .bz2 file;
        # "-f" overwrites an existing extracted file.
        if subprocess.call(['bunzip2', '-f', bzFilenamePath]) != 0:
            return False, ''
        print('Extracted: %s' % bzFilenamePath)
        return True, xmlfilename
    except IOError:
        return False, ''
    except Exception:
        print('Exception:  %s' % str(sys.exc_info()))
        return False, ''


def titlesfilename(xmlfilename):
    '''Return the name of the "titles" index file of a dump file.'''
    return xmlfilename + '.titles'


def unwiki(whichtext):
    '''Return whichtext as text with the XML entities unescaped.

    Byte strings are decoded as UTF-8 first.
    '''
    _text = _to_text(whichtext)
    # unescape characters
    _text = _text.replace('&gt;', '>')
    _text = _text.replace('&lt;', '<')
    _text = _text.replace('&quot;', '"')
    # must be last, otherwise "&amp;lt;" would be unescaped twice
    _text = _text.replace('&amp;', '&')
    return _text


def get_site_from_dumpname(whichdumpname):
    '''Return ``(project, lang)`` deduced from a dump file name.

    For example "elwiktionary-20130101-..." gives ("wiktionary", "el");
    the dump name "wiki" is reported as project "wikipedia". Returns
    ``('', '')`` when the name matches no known project.
    '''
    simpledumpname = os.path.basename(whichdumpname)  # just in case
    simpledumpname = simpledumpname.split('-', 1)[0]
    for dumpname in knownprojectdumpnames:
        if simpledumpname.endswith(dumpname):
            project = 'wikipedia' if dumpname == 'wiki' else dumpname
            lang = simpledumpname[:-len(dumpname)]
            return project, lang
    return '', ''


def create_a_titles_file(xmlfilename, ftitles):
    '''Create the "titles" file ftitles for the dump xmlfilename.

    Every line of the result is a ``<title>`` line of the dump prefixed
    by its byte offset (``grep -b``). Returns True on success and False
    if grep could not be run or reported an error.
    '''
    try:
        with open(ftitles, 'wb') as f:
            returncode = subprocess.call(
                ['grep', '-b', '-E', '<title>.+</title>', xmlfilename],
                stdout=f)
    except Exception:
        print('Exception:  %s' % str(sys.exc_info()))
        return False
    # grep exits with 0 (matches), 1 (no match) or >1 (real error).
    if returncode > 1:
        return False
    print('"titles" file created... %s' % ftitles)
    return True


def create_newer_titles_file(xmlfilename, forcecreation=False):
    '''Makes sure an up to date "titles" file exists for a dump.

    The titles file is (re)created when it is missing, when it is older
    than the dump, or when forcecreation is True.
    Returns False if no source exists (aka the xml dump), if the given
    file is itself a titles file or if the creation failed.
    '''
    if not os.path.exists(xmlfilename):
        return False
    if xmlfilename.endswith('.titles'):
        return False
    ftitles = titlesfilename(xmlfilename)
    uptodate = (os.path.exists(ftitles)
                and os.path.getmtime(xmlfilename)
                <= os.path.getmtime(ftitles))
    if uptodate and not forcecreation:
        return True
    return create_a_titles_file(xmlfilename, ftitles)


class GetWikiText:
    '''Fetches wiki text either from a local dump or from the live site.

    All public methods return a ``(success, text_or_message)`` tuple.
    '''

    def __init__(self):
        pass

    def get_from_online(self):
        '''Fetch the page ``self.texttofind`` from ``self.site`` with curl.'''
        # Quote the title so that spaces, "&", "%" etc. survive the url;
        # "|" stays literal because the api uses it as a separator.
        title = _urlquote(_to_native(self.texttofind), safe='/:|')
        commandurl = (_to_native(self.site)
                      + 'w/api.php?format=xml&action=query'
                      + '&prop=revisions&titles='
                      + title + '&rvprop=user|content')
        commandvars = ['curl', '--retry', '10', '-s', '-f', commandurl]
        try:
            content = subprocess.check_output(commandvars)
            return True, unwiki(content)
        except subprocess.CalledProcessError as e:
            return False, ('Άγνωστο σφάλμα:' + str(e.returncode)
                           + ' στο get_from_online()')
        except OSError:
            # curl itself could not be started
            return False, 'Άγνωστο σφάλμα στο get_from_online()'

    def get_using_titles(self, titlesfile):
        '''Read the page titled ``self.texttofind`` using the titles file.

        The titles file gives the byte offset of the page's title line
        and of the following one; the bytes in between are returned
        (up to the end of the dump for the last page).
        '''
        try:
            # -F: the title is a literal string, not a regular expression
            lines = subprocess.check_output(
                ['grep', '-F', '-m', '1', '-A', '1', '-e',
                 '>' + _to_native(self.texttofind) + '<', titlesfile])
            startlines = lines.splitlines()
            start1 = int(startlines[0].split(b':', 1)[0])
            # Binary mode: the offsets are byte offsets.
            with open(self.infile, 'rb') as f:
                f.seek(start1)
                if len(startlines) > 1:
                    start2 = int(startlines[1].split(b':', 1)[0])
                    content = f.read(start2 - start1)
                else:
                    content = f.read()
            return True, unwiki(content)
        except subprocess.CalledProcessError as e:
            if e.returncode == 1:
                return False, 'Το λήμμα δεν βρέθηκε.'
            return False, ('Άγνωστο σφάλμα:' + str(e.returncode)
                           + ' στο get_using_titles()')
        except Exception:
            return False, 'Άγνωστο σφάλμα στο get_using_titles()'

    def get_text(self, data):
        '''Return the text requested by data.

        data must provide the attributes text, file, fromonline, site,
        wants_as_title, howmany, after, before and inbetween. When
        fromonline is set the live site is queried; otherwise the dump
        data.file is searched, through its titles file if
        wants_as_title is set and that file exists, else with a plain
        grep of data.text (as a regular expression).
        '''
        self.texttofind = data.text
        self.infile = data.file
        self.fromonline = data.fromonline
        self.site = data.site
        if self.fromonline:
            return self.get_from_online()
        if not self.infile or not os.path.exists(self.infile):
            return False, 'no file specified'
        if len(self.texttofind) < 1:
            return False, 'no text specified'
        if data.wants_as_title:
            titlesfile = titlesfilename(self.infile)
            if os.path.exists(titlesfile):
                return self.get_using_titles(titlesfile)
        # else do normal search:
        # either the file is a titles file or no titles file exists
        command = ['grep']
        if data.howmany > 0:
            command += ['-m', str(data.howmany)]
        if data.after > 0:
            command += ['-A', str(data.after)]
        elif data.before > 0:
            command += ['-B', str(data.before)]
        elif data.inbetween > 0:
            command += ['-C', str(data.inbetween)]
        # -e keeps a pattern starting with "-" from being read as an option
        command += ['-e', _to_native(self.texttofind), self.infile]
        try:
            print(command)
            contents = subprocess.check_output(command)
            return True, unwiki(contents)
        except subprocess.CalledProcessError as e:
            if e.returncode == 1:
                return False, 'Το όρισμα δεν βρέθηκε.'
            return False, ('Άγνωστο σφάλμα:' + str(e.returncode)
                           + ' στο get_text()')
        except Exception:
            return False, u'exception occured'


# generator ===================================================================
class XmlEntry:
    """
    Represents a reduced page.
    We do not check for redirects but the attribute exists for
    compatibility.
    """

    def __init__(self, title, ns, text, redirect):
        self.title = title
        self.ns = ns
        self.text = text
        self.isredirect = redirect


def _entry_from_page(title, textinpage):
    '''Build an XmlEntry from the raw bytes of one page.

    Returns None for pages whose text element is empty. Raises
    IndexError when the expected ``<ns>`` or ``<text>`` markup is
    missing.
    '''
    if _EMPTY_TEXT_TAG in textinpage:
        return None
    textsplited = textinpage.split(_TEXT_OPEN_TAG)
    entryns = textsplited[0].split(b'<ns>')[1].split(b'</ns>')[0]
    entrytext = textsplited[1].split(b'</text>')[0]
    return XmlEntry(ns=entryns, title=title, text=entrytext, redirect='')


class WikiDump():
    '''A dump file together with its "titles" index file.'''

    def __init__(self, dumpfilenamepath):
        '''Constructor.

        Accepts either the xml dump or its titles file; both must
        exist. self._ISOK holds "if all OK". On success
        self.dumpfilenamepath is the xml dump and self.titlesfilename
        its titles file; otherwise self.titlesfilename is None.
        '''
        self._ISOK = False
        self.dumpfilenamepath = dumpfilenamepath
        self.titlesfilename = None
        if not os.path.exists(dumpfilenamepath):
            return
        if dumpfilenamepath.endswith('.titles'):
            # we were given the titles file: derive the xml dump
            xmlpath = dumpfilenamepath[:-len('.titles')]
            titlespath = dumpfilenamepath
        else:
            # we were given the xml dump: derive the titles file
            xmlpath = dumpfilenamepath
            titlespath = titlesfilename(dumpfilenamepath)
        if not (os.path.exists(xmlpath) and os.path.exists(titlespath)):
            return
        self.dumpfilenamepath = xmlpath
        self.titlesfilename = titlespath
        self._ISOK = True

    def parse(self):
        '''Yields articles from a dump (xml file)

        Uses a titles file.
        Does not replace '&gt;' etc. nor converts to utf-8
        since article may not be used
        (ex. if only ns '10' is needed): title, ns and text of the
        yielded XmlEntry objects are raw byte strings.
        Pages with an empty text element are skipped. Yields nothing
        if the constructor did not find both files.
        '''
        if not self._ISOK:
            return
        with open(self.dumpfilenamepath, 'rb') as fxml, \
                open(self.titlesfilename, 'rb') as ftitles:
            start = None  # offset of the page waiting to be read
            title = b''
            for titlesline in ftitles:
                if not titlesline.strip():
                    continue
                nextstart = int(titlesline.split(b':', 1)[0])
                nexttitle = (titlesline.split(b'</title>')[0]
                             .split(b'<title>')[1])
                if start is not None:
                    # A page spans from its title line to the next one.
                    fxml.seek(start)
                    entry = _entry_from_page(
                        title, fxml.read(nextstart - start))
                    if entry is not None:
                        yield entry
                start = nextstart
                title = nexttitle
            if start is None:
                return  # empty titles file
            # The last page runs up to the end of the dump.
            fxml.seek(start)
            entry = _entry_from_page(title, fxml.read())
            if entry is not None:
                yield entry