User:Vanished user Xorisdtbdfgonugyfs/scripts/getnewpages.py
#!/usr/bin/python3
# new approach
# Get latest pages and update local xml dump
# v0.0.4
# Step one:
# Just get changed lemmas and save the dictionary with json.
# TODO:
# 1. Update timestamp file after read.
# 2. Read new revisions of articles.
# 3. Merge them with the old dump.

import re
import urllib.request
import urllib.parse
import os, glob
import codecs
import time
import datetime
import json
import xml.etree.ElementTree as XMLPARSER  # cElementTree is deprecated and removed in Python 3.9+

#mywiktionarysite = 'http://el.wiktionary.org/'
#myxmlfile = 'last_full.xml'
#mytmpdir = ''  # in case someone wants a special tmp path
#lasttimestampfile = '20140329000001'


def getnonZtime(whichZtime):
    # Convert an ISO 8601 "Z" timestamp (e.g. 2014-03-29T00:00:01Z) to YYYYMMDDHHMMSS.
    return (whichZtime[:4] + whichZtime[5:7] + whichZtime[8:10]
            + whichZtime[11:13] + whichZtime[14:16] + whichZtime[17:19])


def getallchanged(siteurl, Ztimestampofcurrentdump):
    # Walk the recent changes backwards from now until the dump timestamp,
    # collecting the titles of changed lemmas, then save them as JSON.
    allmyrecentchanges = {}
    timenow = datetime.datetime.utcnow().strftime('%Y%m%d%H%M%S')
    timefromage = getnonZtime(Ztimestampofcurrentdump)
    rcstart = timenow
    continuevalue = ''
    stopedbyerrors = False
    while rcstart > timefromage:
        time.sleep(1)
        #print("getting changes from...")
        #print(siteurl)
        #continuevalue = ''
        #TODO check whether usage of continuevalue changed
        dictwithchanged, timetocontiuefrom = getrecentchanges(
            siteurl, rcstart, timefromage, continuevalue, allmyrecentchanges)  #, rclimit=500)
        print("returned...")
        # on error we will not get a dictionary
        if type(dictwithchanged) == type(allmyrecentchanges):
            allmyrecentchanges = dictwithchanged
            # the rccontinue value looks like "<timestamp>|<rcid>"; keep the timestamp part
            rcstart = timetocontiuefrom.split("|")[0]
            #print('Continue from...', rcstart)
        else:
            print("ERROR")
            stopedbyerrors = True
            break
    if stopedbyerrors:
        timenow = timenow + '-stopedbyerrors'
    with open('latestchanges-' + timenow, 'w') as f:
        json.dump(allmyrecentchanges, f)
    print("SAVED")


def getrecentchanges(siteurl, rcstart, rcend, rccontinue, allmyrecentchanges, rclimit=500):
    # Query the MediaWiki API for one batch of recent changes and merge the
    # titles (with their latest timestamp) into allmyrecentchanges.
    try:
        urldata = {
            'action': 'query',
            'list': 'recentchanges',
            'format': 'xml',
            'rcstart': rcstart,
            'rcend': rcend,
            'uselang': 'el',
            'continue': rccontinue,
            'rclimit': rclimit,
            'maxlag': 1,
            'rcprop': 'timestamp|title'
        }
        params = urllib.parse.urlencode(urldata)
        url = siteurl + 'w/api.php?%s' % params
        #print(url)
        headers = {}
        headers['User-Agent'] = "Bot For recentchanges"  #"Mozilla/5.0 (X11; U; Linux ia64) Gecko/20071127 Firefox/2.0.0.11"
        req = urllib.request.Request(url, headers=headers)
        resp = urllib.request.urlopen(req)
        respData = resp.read()
        alldata = respData.decode('utf8')
        #print("len of alldata...", len(alldata))
        root = XMLPARSER.fromstring(alldata)
        for b in root.iter('continue'):  #TODO Constantly check for changes in wiki's xml creator
            rccontinue = b.attrib['rccontinue']
        for onerev in root.iter('rc'):
            #print("found rc...")
            # No need to add lemmas whose latest change I already have
            if (not len(allmyrecentchanges)) or (onerev.attrib['title'] not in allmyrecentchanges):
                allmyrecentchanges[onerev.attrib['title']] = {'timestamp': onerev.attrib['timestamp']}
        return allmyrecentchanges, rccontinue
    except Exception as e:
        print(str(e))
        return ('ERROR', '')
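
A minimal usage sketch, not part of the original script: it assumes the file above is saved as getnewpages.py and imported as a module, and the site URL and dump timestamp shown are illustrative values, not taken from the script.

# Hypothetical driver for getnewpages.py (assumed module name and example arguments).
import getnewpages

if __name__ == '__main__':
    # Collect all lemmas changed since the given dump timestamp and write them as JSON
    # to a file named latestchanges-<UTC timestamp>, mapping each title to its latest
    # change timestamp, e.g. {"title": {"timestamp": "2014-04-01T12:34:56Z"}}.
    getnewpages.getallchanged('http://el.wiktionary.org/', '2014-03-29T00:00:01Z')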