Χρήστης:AtouBot/getrcs.sh
Από Βικιλεξικό
Δείτε επίσης: Χρήστης:AtouBot/getrcs.sh/docs
Πίνακας περιεχομένων |
== getrcs.sh ==
#!/bin/bash
#
# getrcs.sh - bring a local wiki snapshot up to date.
#
# The script collects the titles touched between two points in time from the
# wiki's recent changes and its move/import/upload/delete logs, exports the
# current text of those pages (full snapshots only) and merges everything
# into the previous snapshot.
#
# The step that is run depends on the name the script is invoked under:
#   getchanges.sh  recent changes
#   getmoves.sh    move log
#   getimports.sh  import log
#   getuploads.sh  upload log
#   getpages.sh    export of the collected titles
#   getdeletes.sh  delete log
#   domerges.sh    merge of exported pages and deletes into the snapshot
# Under any other name every step is run, in the order above (the page
# export runs before the delete log is read).
#
# Settings (wiki, snapshottype, logsecs, pagesecs, tmp, snapshot,
# snapshotdir, lastrun) come from the sourced config file.
#
# TODO
#
# check for available disk space at beginning of run
# of any of the three parts.  We will need at least size($lastfull) bytes
# plus (finding out in a minute)

# Print the command line help and exit with status 1.
usage() {
  echo "Usage: $0 startdate enddate [configfile]"
  echo "where startdate is latest date from which to get changes"
  echo "and enddate is the earliest date, in the local timezone."
  echo "The base date may be specified as either today, or lastrun,"
  echo "where lastrun is the latest date you got changes from"
  echo "during the previous run."
  echo
  echo "For example:"
  echo "$0 today today-3d"
  echo "$0 today-1h today-5h"
  echo "$0 today lastrun"
  echo "If you omit the d or h the increment is interpreted as days"
  echo
  echo "Alternatively you can specify absolute timestamps."
  echo "These must be in the format yyyy-mm-ddThh:mm:ssZ"
  echo "For example:"
  echo "$0 2008-02-06T08:54:06Z 2008-01-23T08:00:00Z"
  echo "In this case the times are interpreted as UTC times."
  echo
  echo "The optional configfile argument tells the script to use"
  echo "the config file you specify instead of the default config.txt"
  exit 1
}

if [ -z "$1" ] || [ -z "$2" ]; then
  usage
fi

# Load the configuration: an explicit file if given, ./config.txt otherwise.
if [ -n "$3" ]; then
  if [ -e "$3" ]; then
    source "$3"
  else
    echo "Specified config file $3 does not exist."
    usage
  fi
elif [ -e ./config.txt ]; then
  source ./config.txt
else
  echo "Default config file ./config.txt does not exist."
  usage
fi

if [ ! -e "./sort.pl" ] || [ ! -e "./uniq.pl" ]; then
  echo "You are missing one or both of the files sort.pl or uniq.pl which should be in the same directory"
  echo "from which you run this command.  Please put them in place and run this again."
  exit 1
fi

# Explain how to create the last-run timestamp file and exit with status 1.
usage_lastrun() {
  echo "In order to use lastrun+-(d|h), you need to have the timestamp of the last run"
  echo "stored in the file $lastrun in the current directory.  To get the appropriate"
  echo "timestamp, run"
  echo 'date +%s -d "yyyy-mm-dd hh:mm:ss +0000" > $lastrun'
  echo "Then run this script again."
  exit 1
}

# checkformat DATESPEC
#
# Convert DATESPEC to seconds since the epoch and store the result in the
# global variable `secs` (the return status is always 0; earlier versions
# tried to `return $secs`, which the shell truncates modulo 256).
#
# DATESPEC is one of:
#   (empty)                     the current time
#   yyyy-mm-ddThh:mm:ssZ        an absolute UTC timestamp
#   today|lastrun[(+|-)N[d|h]]  a base date with an optional offset of N
#                               days (default unit) or hours
# Anything else prints the usage text and exits.
checkformat() {
  local d="$1"
  local reformatted op basedate incr incrtype lastdaterun

  if [ -z "$d" ]; then
    secs=$(date +%s)
    return 0
  fi

  case "$d" in
    *Z*)
      # format such as: 2008-01-23T08:00:00Z
      # converted to:   2008-01-23 08:00:00 +0000
      reformatted=$(echo "$d" | sed -e 's/T/ /; s/Z/ +0000/;')
      secs=$(date --date="$reformatted" +%s) || usage
      return 0
      ;;
  esac

  # A '-' takes precedence over a '+' when both are present.
  case "$d" in
    *-*) op="-" ;;
    *+*) op="+" ;;
    *)   op="" ;;
  esac

  if [ -z "$op" ]; then
    basedate=$d
    incr=0
  else
    # first and second field of the spec, split on the operator
    basedate=${d%%"$op"*}
    incr=${d#*"$op"}
    incr=${incr%%"$op"*}
  fi

  # The unit defaults to days; strip the unit letter from the increment.
  incrtype="d"
  case "$incr" in
    *d*) incrtype="d" ;;
    *h*) incrtype="h" ;;
  esac
  incr=${incr/"$incrtype"/}
  incr=${incr:-0}
  case "$incr" in
    *[!0-9]*) usage ;;
  esac

  case "$basedate" in
    today)
      secs=$(date +%s)
      ;;
    lastrun)
      if [ ! -e "$lastrun" ]; then
        usage_lastrun
      fi
      lastdaterun=$(cat "$lastrun")
      # date doubles as the validity check for the stored value
      if ! secs=$(date +%s -d @"$lastdaterun"); then
        usage_lastrun
      fi
      ;;
    *)
      usage
      ;;
  esac

  # 10# keeps increments with leading zeros from being read as octal
  case "$incrtype" in
    d) incr=$(( 10#$incr * 86400 )) ;;
    h) incr=$(( 10#$incr * 3600 )) ;;
  esac

  case "$op" in
    -) secs=$(( secs - incr )) ;;
    +) secs=$(( secs + incr )) ;;
  esac

  return 0
}

checkformat "$1"
startdatesecs=$secs
checkformat "$2"
enddatesecs=$secs

ext=$(date +%m-%d-%Y -d @"$startdatesecs")
globstartdate=$(date -u -d @"$startdatesecs" +"%Y-%m-%dT%H:%M:%SZ")
globenddate=$(date -u -d @"$enddatesecs" +"%Y-%m-%dT%H:%M:%SZ")
lastdaterun="$startdatesecs"
me=$(basename "$0")

mkdir -p "$tmp"
changes="$tmp/changes.$ext"
moves="$tmp/moves.$ext"
imports="$tmp/imports.$ext"
uploads="$tmp/uploads.$ext"
deletes="$tmp/deletes.$ext"
pages="$tmp/pages.$ext"
titles="$tmp/titles.$ext"
full="$tmp/full.$ext"
lastfull="$snapshotdir/$snapshot"

# Pick the step from the name we were called under.
case "$me" in
  getchanges.sh) do="changes" ;;
  getmoves.sh)   do="moves" ;;
  getimports.sh) do="imports" ;;
  getuploads.sh) do="uploads" ;;
  getdeletes.sh) do="deletes" ;;
  getpages.sh)   do="pages" ;;
  domerges.sh)   do="merges" ;;
  *)
    # a complete run starts from a clean set of work files
    rm -f "$titles".* "$pages".* "$deletes".* "$moves".* "$imports".* "$uploads".* "$changes".*
    do="all"
    ;;
esac

if [ "$do" != "all" ]; then
  rm -f "$tmp/$do".*
fi

# ---------------------------------------------------------------------------
# helpers
# ---------------------------------------------------------------------------

# fetch_or_bail WHAT OUTFILE CURL-ARGS...
# Run curl with the common options plus CURL-ARGS, writing the response to
# OUTFILE.  On failure report curl's own exit status and exit.
fetch_or_bail() {
  local what="$1" out="$2" rc
  shift 2
  curl --retry 10 -H "Expect:" -f "$@" > "$out"
  rc=$?
  if [ "$rc" -ne 0 ]; then
    echo "Error $rc from curl, unable to get $what, bailing"
    exit 1
  fi
}

# same_as_previous RAW CMP
# True when CMP exists and RAW is identical to it, i.e. the server returned
# the same batch again and there is nothing more to fetch.
same_as_previous() {
  [ -e "$2" ] && cmp -s "$1" "$2"
}

# log_items FILE
# Print the <item logid...> records of a log response, one per line.
log_items() {
  sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' "$1" | grep '<item logid'
}

# last_timestamp FILE PATTERN
# Print the 16th double-quote-delimited field of the last record in FILE
# matching PATTERN.  NOTE(review): the field numbers used throughout this
# script depend on the attribute order of the API's XML output - confirm
# against the current API before relying on them.
last_timestamp() {
  sed -e 's/>/>\n/g;' "$1" | grep "$2" | awk -F'"' '{ print $16 }' | tail -n 1
}

# walk_log STEP LETYPE PREFIX HANDLER
# Page through the log LETYPE from $globstartdate back to $globenddate, 500
# entries per request.  Each response is written to PREFIX.raw, appended to
# PREFIX.raw.save and passed to HANDLER.  Does nothing unless STEP is the
# selected step or all steps are being run.
walk_log() {
  local step="$1" letype="$2" prefix="$3" handler="$4"
  local start="$globstartdate" end="$globenddate" next

  if [ "$do" != "$step" ] && [ "$do" != "all" ]; then
    return 0
  fi

  while true; do
    echo getting $step $start to $end
    # get the next lines from the log
    fetch_or_bail "$step" "$prefix.raw" \
      "http://$wiki/w/api.php?action=query&list=logevents&letype=$letype&lelimit=500&format=xml&lestart=$start&leend=$end"
    if same_as_previous "$prefix.raw" "$prefix.cmp"; then
      break
    fi
    cp "$prefix.raw" "$prefix.cmp"
    cat "$prefix.raw" >> "$prefix.raw.save"

    "$handler" "$prefix.raw"

    # continue from the timestamp of the last line;
    # if it is empty we are done (this should never happen)
    next=$(last_timestamp "$prefix.raw" '<item logid')
    if [ -z "$next" ]; then
      break
    fi
    start="$next"
    sleep "$logsecs"
  done
}

# titles_from_moves FILE
# Append the titles found in a batch of the move log to the titles list:
# field 8 and field 22 of each entry.  For "full" snapshots, entries marked
# suppressedredirect with ns="0" are recorded as deletes instead of having
# field 8 added to the titles list.
titles_from_moves() {
  local raw="$1"
  case "$snapshottype" in
    fullwithusers)
      log_items "$raw" | awk -F'"' '{ print $8 }' >> "$titles.txt"
      log_items "$raw" | awk -F'"' '{ print $22 }' >> "$titles.txt"
      ;;
    full)
      log_items "$raw" | awk -F'"' '{ print $22 }' | grep -v ':' >> "$titles.txt"
      log_items "$raw" | grep -v 'suppressedredirect' | awk -F'"' '{ print $8 }' | grep -v ':' >> "$titles.txt"
      log_items "$raw" | grep 'suppressedredirect' | grep 'ns="0"' | awk -F'"' '{ print $16 " " $8 }' >> "$deletes.xml"
      ;;
    titleswithusers)
      log_items "$raw" | awk -F'"' '{ print $16 " " $8 }' >> "$titles.txt"
      log_items "$raw" | awk -F'"' '{ print $16 " " $22 }' >> "$titles.txt"
      ;;
    titles)
      log_items "$raw" | awk -F'"' '{ print $16 " " $8 }' | grep -v ':' >> "$titles.txt"
      log_items "$raw" | awk -F'"' '{ print $16 " " $22 }' | grep -v ':' >> "$titles.txt"
      ;;
    *)
      ;;
  esac
}

# titles_from_log FILE
# Append the titles found in a batch of the import or upload log to the
# titles list.  The namespace-0-only snapshot types drop titles containing
# a colon.
titles_from_log() {
  local raw="$1"
  case "$snapshottype" in
    fullwithusers)
      log_items "$raw" | awk -F'"' '{ print $8 }' >> "$titles.txt"
      ;;
    full)
      log_items "$raw" | awk -F'"' '{ print $8 }' | grep -v ':' >> "$titles.txt"
      ;;
    titleswithusers)
      log_items "$raw" | awk -F'"' '{ print $16 " " $8 }' >> "$titles.txt"
      ;;
    titles)
      log_items "$raw" | awk -F'"' '{ print $16 " " $8 }' | grep -v ':' >> "$titles.txt"
      ;;
    *)
      ;;
  esac
}

# record_deletes FILE
# Append "timestamp title" for every entry of a delete log batch, except
# action="revision" entries, to the deletes list.  These are not filtered
# by snapshot type.
record_deletes() {
  sed -e 's/>/>\n/g;' "$1" | grep '<item logid' | grep -v 'action="revision"' \
    | awk -F'"' '{ print $16 " " $8 }' >> "$deletes.xml"
}

# merge_failed MESSAGE
# Report a failed merge step and exit before the snapshot is overwritten.
merge_failed() {
  echo "$1"
  echo "The previous snapshot has been left untouched."
  exit 1
}

# ---------------------------------------------------------------------------
# recent changes
# ---------------------------------------------------------------------------
if [ "$do" = "changes" ] || [ "$do" = "all" ]; then
  rcstartdate=$globstartdate
  rcenddate=$globenddate
  while true; do
    echo getting recent changes $rcstartdate to $rcenddate
    # get the next lines from the recent changes log; the snapshot types
    # without "withusers" are limited to namespace 0
    rcurl="http://$wiki/w/api.php?action=query&list=recentchanges&rclimit=500&rctype=new|edit&format=xml&rcstart=$rcstartdate&rcend=$rcenddate"
    case "$snapshottype" in
      fullwithusers | titleswithusers)
        ;;
      full | titles)
        rcurl="$rcurl&rcnamespace=0"
        ;;
      *)
        echo "Unknown snapshot type. Please check your configuration file and"
        echo "run this step again."
        exit 1
        ;;
    esac
    fetch_or_bail "recent changes" "$changes.raw" "$rcurl"
    if same_as_previous "$changes.raw" "$changes.cmp"; then
      break
    fi
    cp "$changes.raw" "$changes.cmp"
    cat "$changes.raw" >> "$changes.raw.save"

    # get the titles
    case "$snapshottype" in
      fullwithusers | full)
        sed -e 's/>/>\n/g;' "$changes.raw" | grep '<rc type=' \
          | awk -F'"' '{ print $6 }' >> "$titles.txt"
        ;;
      *)
        sed -e 's/>/>\n/g;' "$changes.raw" | grep '<rc type=' \
          | awk -F'"' '{ print $16 " " $6 }' >> "$titles.txt"
        ;;
    esac

    # get the timestamp from the last line;
    # if it is empty we are done (this should never happen)
    nextstartdate=$(last_timestamp "$changes.raw" '<rc type=')
    if [ -z "$nextstartdate" ]; then
      break
    fi
    rcstartdate="$nextstartdate"
    sleep "$logsecs"
  done
fi

# ---------------------------------------------------------------------------
# moves, imports, uploads
# ---------------------------------------------------------------------------
walk_log moves move "$moves" titles_from_moves
walk_log imports import "$imports" titles_from_log
walk_log uploads upload "$uploads" titles_from_log

# ---------------------------------------------------------------------------
# pages with the collected titles (full snapshots only)
# ---------------------------------------------------------------------------
if [ "$snapshottype" == "fullwithusers" ] || [ "$snapshottype" == "full" ]; then
  mv "$titles.txt" "$titles.txt-temp"
  ./sort.pl < "$titles.txt-temp" | ./uniq.pl > "$titles.txt"

  count=1
  while true; do
    if [ "$do" != "pages" ] && [ "$do" != "all" ]; then
      break
    fi

    echo getting pages $count to $count+500
    # next 500 titles
    tail -n +"$count" "$titles.txt" | head -n 500 > "$titles.500.txt"
    left=$(wc -l < "$titles.500.txt")
    if [ "$left" -eq 0 ]; then
      break
    fi
    count=$(( count + 500 ))

    # NOTE(review): the export page name is hard-coded here although the
    # config file defines expurl - confirm which one is intended.
    fetch_or_bail "xml pages" "$pages.xml-temp" \
      -F "curonly=1" -F "wpDownload=1" -F "pages=<$titles.500.txt" \
      "http://$wiki/w/index.php?title=Special:Export&action=submit"

    # put the new batch in front of the older batches, and back into the
    # same filename (so most recent revs are at the beginning)
    if [ -e "$pages.xml" ]; then
      mv "$pages.xml" "$pages.xml-old"
    fi
    if [ -e "$pages.xml-old" ]; then
      cat "$pages.xml-temp" "$pages.xml-old" > "$pages.xml"
    else
      cat "$pages.xml-temp" > "$pages.xml"
    fi
    sleep "$pagesecs"
  done
fi

# ---------------------------------------------------------------------------
# deletes
# ---------------------------------------------------------------------------
walk_log deletes delete "$deletes" record_deletes

if [ "$do" != "merges" ] && [ "$do" != "all" ]; then
  echo "done!"
  exit 0
fi

# ---------------------------------------------------------------------------
# merges of new pages, changed pages, and deletes
# ---------------------------------------------------------------------------

# From here on a failure anywhere in a pipeline must be noticed, otherwise
# an empty or truncated merge result would be copied over the snapshot.
set -o pipefail

# Each snapshot family uses its own pair of merge scripts.
case "$snapshottype" in
  titleswithusers | titles)
    mergepages=./merge-pages-main-and-export-titles.pl
    mergedeletes=./merge-deletes-titles.pl
    ;;
  *)
    mergepages=./merge-pages-main-and-export.pl
    mergedeletes=./merge-deletes.pl
    ;;
esac

if [ ! -e "$mergepages" ] || [ ! -e "$mergedeletes" ]; then
  echo "One or more of the required scripts for this file are missing:"
  echo "${mergepages#./} or ${mergedeletes#./}."
  echo "Please make sure that they are all in the directory from where you are giving the"
  echo "command $0. "
  exit 1
fi

# NOTE(review): the titles branch prefers $lastfull.bz2 when both files
# exist but writes its result to $lastfull, while the full branch prefers
# $lastfull but writes to $lastfull.bz2.  When both files are present the
# next run may therefore start from a stale file - confirm and clean up the
# unused copy by hand.

if [ "$snapshottype" == "titleswithusers" ] || [ "$snapshottype" == "titles" ]; then

  if [ ! -e "$lastfull" ] && [ ! -e "$lastfull.bz2" ]; then
    echo "${lastfull}{.bz2} does not exist.  Please copy your last full incremental into this file"
    echo "and run this script again as domerges.sh $1 $2 $3 in order to finish this last step."
    echo "You can either compress it as a bz2 file or leave it uncompressed."
    exit 1
  fi

  compressed=""
  if [ -e "$lastfull.bz2" ]; then
    compressed="true"
  fi

  # full xml files have this in their first line
  if [ -z "$compressed" ]; then
    isxml=$(head -1 "$lastfull" | grep -E '<page|<mediawiki')
  else
    isxml=$(bzcat "$lastfull.bz2" | head -1 | grep -E '<page|<mediawiki')
  fi

  if [ -n "$isxml" ]; then
    echo "generating titles from standard xml file..."
    if [ ! -e ./full2titles.pl ]; then
      merge_failed "The required script full2titles.pl is missing."
    fi
    # we must get the ts and title from the xml file and stuff it somewhere;
    # the xml file itself is kept with a .sav suffix
    if [ -z "$compressed" ]; then
      ./full2titles.pl < "$lastfull" > "$lastfull.titles" \
        || merge_failed "Error generating titles from $lastfull, bailing"
      mv "$lastfull" "$lastfull.sav"
    else
      bzcat "$lastfull.bz2" | ./full2titles.pl > "$lastfull.titles" \
        || merge_failed "Error generating titles from $lastfull.bz2, bailing"
      mv "$lastfull.bz2" "$lastfull.bz2.sav"
    fi
    mv "$lastfull.titles" "$lastfull"
    compressed=""
  fi

  # lose those pages from the full list if the pages in full are older.
  # then cat the rest (from the exports that are newer) on the end.
  echo rewriting full titles list
  if [ -z "$compressed" ]; then
    "$mergepages" "$titles.txt" < "$lastfull" > "${full}-titles.xml-temp" \
      || merge_failed "Error merging titles into $lastfull, bailing"
  else
    bzcat "$lastfull.bz2" | "$mergepages" "$titles.txt" > "${full}-titles.xml-temp" \
      || merge_failed "Error merging titles into $lastfull.bz2, bailing"
  fi

  # process the deletes
  echo processing deletes
  "$mergedeletes" "$deletes.xml" < "${full}-titles.xml-temp" > "${full}-titles.xml" \
    || merge_failed "Error processing deletes, bailing"

  echo copying files into place
  # set up new full to be the next file we use
  cp "${full}-titles.xml" "$lastfull" \
    || merge_failed "Error copying the new titles list to $lastfull, bailing"

  # don't do this til the end, in case of failure
  echo "$lastdaterun" > "$lastrun"
  cp "${full}-titles.xml" "full-titles.$ext.xml"

  echo new full titles list is now in place at full-titles.$ext.xml and $lastfull

else

  # lose those pages from the full xml dump if the pages in full are older.
  # then cat the rest (from the exports that are newer) on the end.
  echo rewriting full dump
  if [ -e "$lastfull" ]; then
    "$mergepages" "$pages.xml" < "$lastfull" | bzip2 > "$full.xml.bz2-temp" \
      || merge_failed "Error merging exported pages into $lastfull, bailing"
  elif [ -e "$lastfull.bz2" ]; then
    bzcat "$lastfull.bz2" | "$mergepages" "$pages.xml" | bzip2 > "$full.xml.bz2-temp" \
      || merge_failed "Error merging exported pages into $lastfull.bz2, bailing"
  else
    echo "${lastfull}{.bz2} does not exist.  Please copy your last full incremental into this file"
    echo "and run this script again as domerges.sh $1 $2 $3 in order to finish this last step."
    echo "You can copy the uncompressed xml file or you can copy it as a bz2 file."
    exit 1
  fi

  # process the deletes
  echo processing deletes
  bzcat "$full.xml.bz2-temp" | "$mergedeletes" "$deletes.xml" | bzip2 > "$full.xml.bz2" \
    || merge_failed "Error processing deletes, bailing"

  echo copying files into place
  # set up new full to be the next file we use
  cp "$full.xml.bz2" "$lastfull.bz2" \
    || merge_failed "Error copying the new dump to $lastfull.bz2, bailing"

  # don't do this til the end, in case of failure
  echo "$lastdaterun" > "$lastrun"
  cp "$full.xml.bz2" "full.$ext.xml.bz2"

  echo new full is now in place at full.$ext.xml.bz2 and $lastfull.bz2
fi

# done!
echo "done!"
exit 0
== sort.pl ==
#!/usr/bin/perl
#
# sort.pl - read all lines from standard input and print them sorted.
#
# Input and output are treated as UTF-8.  Lines are ordered with Perl's
# default string comparison, so the result does not depend on the locale.

use strict;
use warnings;
use utf8;

# The explicit layers below replace the former "use encoding(UTF8)": that
# pragma is deprecated and makes the script abort on Perl 5.26 and later.
binmode(STDIN,  ":utf8");
binmode(STDOUT, ":utf8");

print sort <STDIN>;
== uniq.pl ==
#!/usr/bin/perl
#
# uniq.pl - copy standard input to standard output, dropping every line
# that is identical to the line printed just before it.  Only adjacent
# duplicates are removed, so the input is expected to be sorted.
#
# Input and output are treated as UTF-8.

use strict;
use warnings;
use utf8;

# The explicit layers below replace the former "use encoding(UTF8)": that
# pragma is deprecated and makes the script abort on Perl 5.26 and later.
binmode(STDIN,  ":utf8");
binmode(STDOUT, ":utf8");

my $prevline = "";
while (my $line = <STDIN>) {
    next if $line eq $prevline;
    print $line;
    $prevline = $line;
}
== merge-pages-main-and-export.pl ==
#!/usr/bin/perl
#
# merge-pages-main-and-export.pl - merge freshly exported pages into a dump.
#
# Usage: merge-pages-main-and-export.pl filename-of-exported-pages < old-dump
#
# The old dump is read from standard input and the merged pages are written
# to standard output:
#   1. every page of the old dump whose timestamp is later than the
#      timestamp the export file has for the same title, or whose title is
#      not in the export file at all, is copied through;
#   2. every page of the export file is then appended, except those for
#      which the old dump already supplied a later copy in step 1.
# Only the <page>...</page> elements are written; anything outside them
# (such as the enclosing header and footer) is not copied.
#
# Both files are treated as UTF-8.

use strict;
use warnings;
use utf8;

# The explicit layers below replace the former "use encoding(UTF8)": that
# pragma is deprecated and makes the script abort on Perl 5.26 and later.
binmode(STDIN,  ":utf8");
binmode(STDOUT, ":utf8");

unless (@ARGV) {
    die "Usage: $0 filename-of-exported-pages\n";
}
my $filename = shift;

# Open the export file for reading.  A file that cannot be opened is
# treated as an export without pages, so that a run in which no page
# changed still passes the old dump through; the problem is reported on
# standard error instead of being ignored silently.
sub open_export {
    my $fh;
    unless (open($fh, '<:utf8', $filename)) {
        warn "$0: can't open file $filename ($!), treating it as empty\n";
        return;
    }
    return $fh;
}

# Compare two timestamps of the form yyyy-mm-ddThh:mm:ssZ numerically.
# A missing timestamp counts as earlier than any real one.  Returns -1, 0
# or 1 like <=>.
sub compareem {
    my ($ts1, $ts2) = @_;
    for ($ts1, $ts2) {
        $_ = '' unless defined $_;
        s/[-:TZ]//g;
        $_ = 0 unless length $_;
    }
    return $ts1 <=> $ts2;
}

# hash of titles with timestamps from the export file
my %titles;
if (my $fh = open_export()) {
    my $temptitle;
    while (<$fh>) {
        if (/<title>(.*)<\/title>/) {
            $temptitle = $1;
        }
        elsif (/<timestamp>(.*)<\/timestamp>/) {
            $titles{$temptitle} = $1 if defined $temptitle;
        }
    }
    close($fh);
}

my %wrote;      # titles in the export for which the old dump's copy was kept
my %skipped;    # titles whose old-dump copy was dropped in favour of the export

my $text  = "";
my $title = "";
my $timestamp;  # deliberately not reset per page, as before

# pass 1: the old dump
while (<STDIN>) {
    $text .= $_;
    if (/<page>/) {
        # start collecting a new page
        $text  = $_;
        $title = "";
    }
    elsif (/<title>(.*)<\/title>/) {
        $title = $1;
    }
    elsif (/<timestamp>(.*)<\/timestamp>/) {
        $timestamp = $1;
    }
    elsif (/<\/page>/) {
        # compare the export's timestamp with ours... if ours is later,
        # we write it
        if (compareem($titles{$title}, $timestamp) < 0) {
            print $text;
            $wrote{$title} = 1 if $titles{$title};
        }
        else {
            $skipped{$title} = 1;
        }
    }
}

# pass 2: reopen the export file, skip the titles that we wrote already
# but write the rest
if (my $fh = open_export()) {
    $text  = "";
    $title = "";
    while (<$fh>) {
        $text .= $_;
        if (/<page>/) {
            $text  = $_;
            $title = "";
        }
        elsif (/<title>(.*)<\/title>/) {
            $title = $1;
        }
        elsif (/<\/page>/) {
            # written when the old dump had an older copy of the page or
            # did not have the page at all; dropped when the old dump's
            # copy was newer
            if ($skipped{$title} || !$wrote{$title}) {
                print $text;
            }
        }
    }
    close($fh);
}
== merge-deletes.pl ==
#!/usr/bin/perl
#
# merge-deletes.pl - drop deleted pages from a dump.
#
# Usage: merge-deletes.pl filename-of-deletions < dump
#
# The deletions file holds one "timestamp title" pair per line.  The dump is
# read from standard input; a page is written to standard output only when
# its timestamp is later than the latest deletion recorded for its title,
# or when the title was never deleted.  Only the <page>...</page> elements
# are written.
#
# Both files are treated as UTF-8.

use strict;
use warnings;
use utf8;

# The explicit layers below replace the former "use encoding(UTF8)": that
# pragma is deprecated and makes the script abort on Perl 5.26 and later.
binmode(STDIN,  ":utf8");
binmode(STDOUT, ":utf8");

unless (@ARGV) {
    die "Usage: $0 filename-of-deletions\n";
}
my $filename = shift;

# Compare two timestamps of the form yyyy-mm-ddThh:mm:ssZ numerically.
# A missing timestamp counts as earlier than any real one.  Returns -1, 0
# or 1 like <=>.
sub compareem {
    my ($ts1, $ts2) = @_;
    for ($ts1, $ts2) {
        $_ = '' unless defined $_;
        s/[-:TZ]//g;
        $_ = 0 unless length $_;
    }
    return $ts1 <=> $ts2;
}

# hash of titles with the timestamp of their latest deletion
my %titles;
open(my $fh, '<:utf8', $filename) or die("can't open file $filename\n");
while (my $line = <$fh>) {
    chomp $line;
    my ($timestamp, $title) = split(/ /, $line, 2);
    next unless defined $title;    # blank or malformed line
    # A title may be listed several times, in no guaranteed order; keep the
    # latest deletion so that a copy older than it is never written.
    if (!exists $titles{$title}
        || compareem($timestamp, $titles{$title}) > 0) {
        $titles{$title} = $timestamp;
    }
}
close($fh);

my $text  = "";
my $title = "";
my $timestamp;    # deliberately not reset per page, as before

while (<STDIN>) {
    $text .= $_;
    if (/<page>/) {
        # start collecting a new page
        $text  = $_;
        $title = "";
    }
    elsif (/<title>(.*)<\/title>/) {
        $title = $1;
    }
    elsif (/<timestamp>(.*)<\/timestamp>/) {
        $timestamp = $1;
    }
    elsif (/<\/page>/) {
        # compare the deletion's timestamp with ours... if ours is later,
        # we write it
        if (compareem($titles{$title}, $timestamp) < 0) {
            print $text;
        }
    }
}
== config.txt ==
# Configuration file for wiki snapshots.
# This file is sourced by getrcs.sh, so it must stay valid shell syntax:
# one assignment per line, no spaces around the "=".

# Host name of the wiki to fetch from; change me to your project.
#wiki="en.wiktionary.org"
wiki="el.wiktionary.org"

# Name of the export page; change me to the name used on your wiki.
# NOTE(review): getrcs.sh requests Special:Export directly and does not
# appear to read this value - confirm before relying on it.
expurl='Special:Export'
#expurl='Ειδικό:Export'

# Type of snapshot; change me to one of:
#   fullwithusers    current copies of everything
#   full             current copies of namespace 0
#   titleswithusers  just the current titles of everything
#   titles           just the current titles of namespace 0
snapshottype=fullwithusers

# How many seconds to sleep between log requests.
logsecs=2

# How many seconds to sleep between requests of 500 pages.
pagesecs=5

# Work dir where all intermediate files will live.
tmp="./tmp"

# Name of the file where the snapshot will be stored.
snapshot="last_full.xml"

# Directory where the snapshot will be stored.
snapshotdir="."

# Name of the file where we will keep the date of the last run.
lastrun="last_run"