Χρήστης:AtouBot/getrcs.sh

Από Βικιλεξικό
Μετάβαση σε: πλοήγηση, αναζήτηση

Δείτε επίσης: Χρήστης:AtouBot/getrcs.sh/docs


Πίνακας περιεχομένων

[] getrcs.sh

#!/bin/bash
# TODO
#
# check for available disk space at beginning of run
# of any of the three parts. We will need at least size($lastfull) bytes
# plus (finding out in a minute)
 
# Print the command-line help text on stdout and terminate the script.
# Arguments: none ($0 is used as the script name in the examples).
# Exits:     always with status 1; this function never returns.
usage() {
  cat <<EOF
Usage: $0 startdate  enddate [configfile]
where startdate is latest date from which to get changes
and enddate is the earliest date, in the local timezone.
The base date may be specified as either today, or lastrun,
where lastrun is the latest date you got changes from
during the previous run.

For example:
$0 today today-3d
$0 today-1h today-5h
$0 today lastrun
If you omit the d or h the increment is interpreted as days

Alternatively you can specify absolute timestamps.
These must be in the format yyyy-mm-ddThh:mm:ssZ
For example:
$0 2008-02-06T08:54:06Z  2008-01-23T08:00:00Z
In this case the times are interpreted as UTC times.

The optional configfile argument tells the script to use
the config file you specify instead of the default config.txt
EOF
  exit 1
}
 
# Both date arguments are mandatory.
if [ -z "$1" ] || [ -z "$2" ]; then
  usage
fi

# Load the configuration: the file named by the optional third argument,
# otherwise ./config.txt.  Stop if neither can be read, because the rest of
# the script builds its file names (including rm targets) from the variables
# that the configuration defines.
if [ -n "$3" ]; then
    if [ -e "$3" ]; then
        source "$3"
    else
        echo "Specified config file $3 does not exist."
        usage
    fi
elif [ -e ./config.txt ]; then
    source ./config.txt
else
    echo "Default config file ./config.txt does not exist."
    usage
fi


# The helper scripts are invoked as ./sort.pl and ./uniq.pl, so they must be
# present in the directory the command is run from.
if [ ! -e "./sort.pl" ] || [ ! -e "./uniq.pl" ]; then
      echo "You are missing one or both of the files sort.pl or uniq.pl which should be in the same directory"
      echo "from which you run this command. Please put them in place and run this again."
      exit 1
fi
 
# Explain how to create the last-run timestamp file, then terminate the script.
# Globals: lastrun (read) - name of the file that holds the last-run epoch time
# Exits:   always with status 1; this function never returns.
usage_lastrun() {

    echo "In order to use lastrun+-(d|h), you need to have the timestamp of the last run"
    echo "stored in the file $lastrun in the current directory.  To get the appropriate"
    echo "timestamp, run"
    # Show the real file name: the user's shell has no \$lastrun variable, so a
    # literal "> \$lastrun" could not be pasted and run as printed.
    echo "date +%s -d \"yyyy-mm-dd hh:mm:ss +0000\" > $lastrun"
    echo "Then run this script again."
    exit 1
}
 
# Convert a date specification into seconds since the Unix epoch.
#
# Accepted forms of $1:
#   (empty)                 the current time
#   yyyy-mm-ddThh:mm:ssZ    that absolute time, interpreted as UTC
#   today | lastrun         now, or the epoch time stored in the $lastrun file
#   base-N, base+N          base shifted by N days
#   base-Nd, base+Nh        base shifted by N days (d) or N hours (h)
#
# Globals:  secs    (written) the result; callers read it after the call
#           lastrun (read)    name of the file holding the last-run epoch time
# Returns:  0 on success.  Malformed input ends the script through usage or
#           usage_lastrun.  The result is never passed through the exit
#           status, which can only hold values 0-255.
checkformat() {
    local d basedate incr incrtype op reformatted stamp

    d="$1"

    if [ -z "$d" ]; then
        secs=$(date +%s)
        return 0
    fi

    case "$d" in
        *Z*)
            # format such as: 2008-01-23T08:00:00Z
            # converted to:   2008-01-23 08:00:00 +0000
            reformatted=$(echo "$d" | sed -e 's/T/ /; s/Z/ +0000/;')
            if ! secs=$(date --date="$reformatted" +%s); then
                usage
            fi
            return 0
            ;;
    esac

    # A minus sign takes precedence over a plus sign when choosing the operator.
    case "$d" in
        *-*) op="-" ;;
        *+*) op="+" ;;
        *)   op="" ;;
    esac

    incrtype="d"
    if [ -z "$op" ]; then
        basedate=$d
        incr=0
    else
        basedate=${d%%"$op"*}
        incr=${d#*"$op"}
        # An optional trailing unit selects days or hours; days are the default.
        case "$incr" in
            *d) incr=${incr%d} ;;
            *h) incrtype="h"; incr=${incr%h} ;;
        esac
        if [ -z "$incr" ]; then
            incr=0
        fi
        # Anything other than plain digits cannot be used in the arithmetic below.
        case "$incr" in
            *[!0-9]*) usage ;;
        esac
        # Force base 10 so that values such as 08 are not rejected as bad octal.
        incr=$(( 10#$incr ))
    fi

    case "$basedate" in
        'today')
            secs=$(date +%s)
            ;;
        'lastrun')
            if [ ! -e "$lastrun" ]; then
                usage_lastrun
            fi
            stamp=$(cat "$lastrun")
            # The file must contain something date accepts as an epoch time.
            if ! secs=$(date +%s -d @"$stamp"); then
                usage_lastrun
            fi
            ;;
        *)
            usage
            ;;
    esac

    case "$incrtype" in
        'h') incr=$(( incr * 3600 )) ;;
        *)   incr=$(( incr * 86400 )) ;;
    esac

    case "$op" in
        '-') secs=$(( secs - incr )) ;;
        '+') secs=$(( secs + incr )) ;;
    esac
    return 0
}
 
# Turn both date arguments into epoch seconds; checkformat leaves each
# result in $secs.
checkformat "$1"
startdatesecs=$secs
checkformat "$2"
enddatesecs=$secs

# Suffix (mm-dd-yyyy of the start date) shared by this run's work files.
ext=$(date +%m-%d-%Y -d @$startdatesecs)

# The same two instants as UTC timestamps, the form used in the API queries.
globstartdate=$(date -u -d @$startdatesecs +"%Y-%m-%dT%H:%M:%SZ")
globenddate=$(date -u -d @$enddatesecs +"%Y-%m-%dT%H:%M:%SZ")

lastdaterun="$startdatesecs"
me=$(basename $0)

# Work files all live under $tmp; the previous snapshot lives in $snapshotdir.
mkdir -p $tmp
lastfull="$snapshotdir/$snapshot"
full="$tmp/full.$ext"
titles="$tmp/titles.$ext"
pages="$tmp/pages.$ext"
deletes="$tmp/deletes.$ext"
uploads="$tmp/uploads.$ext"
imports="$tmp/imports.$ext"
moves="$tmp/moves.$ext"
changes="$tmp/changes.$ext"

# The name this script was invoked under selects a single step; any other
# name runs every step.
do="all"
case $me in
    'getchanges.sh') do="changes" ;;
    'getmoves.sh')   do="moves" ;;
    'getimports.sh') do="imports" ;;
    'getuploads.sh') do="uploads" ;;
    'getdeletes.sh') do="deletes" ;;
    'getpages.sh')   do="pages" ;;
    'domerges.sh')   do="merges" ;;
esac

# Clear leftovers: everything for a complete run, otherwise only the files
# belonging to the selected step.
if [ "$do" = "all" ]; then
    rm -f  $titles.* $pages.* $deletes.* $moves.* $imports.* $uploads.* $changes.* 
else
    rm -f "$tmp/$do".*
fi
 
# recent changes
rcstartdate=$globstartdate
rcenddate=$globenddate

while [ 1 ]; do
  if [ "$do" != "changes" ] && [ "$do" != "all" ]; then
    break;
  fi

  echo getting recent changes $rcstartdate to $rcenddate
  # fetch the next batch of entries from the recent changes log; the "full"
  # and "titles" snapshot types restrict the query to namespace 0
  rcquery="http://$wiki/w/api.php?action=query&list=recentchanges&rclimit=500&rctype=new|edit&format=xml&rcstart=$rcstartdate&rcend=$rcenddate"
  case "$snapshottype" in
      "fullwithusers" | "titleswithusers" )
          ;;
      "full" | "titles")
          rcquery="$rcquery&rcnamespace=0"
          ;;
      *)
          echo "Unknown snapshot type. Please check your configuration file and"
          echo "run this step again."
          exit 1
          ;;
  esac
  curl --retry 10 -H "Expect:" -f "$rcquery" > "$changes.raw"
  # keep curl's status now: the [ ] test below resets $?, which would make
  # the error message always report 0
  curlstatus=$?
  if [ "$curlstatus" -ne 0 ]; then
      echo "Error $curlstatus from curl, unable to get recent changes, bailing"
      exit 1
  fi
  # a reply identical to the previous one means nothing new is left to fetch
  if [ -e "$changes.cmp" ] && cmp -s "$changes.raw" "$changes.cmp"; then
      break
  fi
  cp "$changes.raw" "$changes.cmp"
  cat "$changes.raw" >> "$changes.raw.save"

  # collect the titles (prefixed with the timestamp for the titles-only types)
  case "$snapshottype" in
      "fullwithusers" | "full")
      sed -e 's/>/>\n/g;' "$changes.raw" | grep '<rc type=' | awk -F\" '{ print $6 }'  >> "$titles.txt"
      ;;
      "titleswithusers" | "titles" | *)
      sed -e 's/>/>\n/g;' "$changes.raw" | grep '<rc type=' | awk -F\" '{ print $16 " " $6 }'  >> "$titles.txt"
      ;;
  esac

  # take the timestamp from the last entry; the next query starts there
  nextstartdate=$(sed -e 's/>/>\n/g;' "$changes.raw" | grep '<rc type=' | awk -F\" '{ print $16 }' | tail -n 1)
  # if it is empty we are done (this should never happen)
  if [[ -z "$nextstartdate" ]]; then
    break
  fi

  rcstartdate="$nextstartdate"
  sleep "$logsecs"
done
 
mvstartdate=$globstartdate
mvenddate=$globenddate

# Print the <item logid ...> lines of the current move-log batch, after
# breaking the XML after every "/>" and before every "<item".
move_items() {
    sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' "$moves.raw" | grep '<item logid'
}

while [ 1 ]; do
    if [ "$do" != "moves" ] && [ "$do" != "all" ]; then
        break;
    fi

    echo getting moves $mvstartdate to $mvenddate
    # fetch the next batch of entries from the move log
    curl --retry 10 -H "Expect:" -f "http://$wiki/w/api.php?action=query&list=logevents&letype=move&lelimit=500&format=xml&lestart=$mvstartdate&leend=$mvenddate"  >  "$moves.raw"
    # keep curl's status now: the [ ] test below resets $?, which would make
    # the error message always report 0
    curlstatus=$?
    if [ "$curlstatus" -ne 0 ]; then
        echo "Error $curlstatus from curl, unable to get moves, bailing"
        exit 1
    fi

    # a reply identical to the previous one means nothing new is left to fetch
    if [ -e "$moves.cmp" ] && cmp -s "$moves.raw" "$moves.cmp"; then
        break
    fi
    cp "$moves.raw" "$moves.cmp"
    cat "$moves.raw" >> "$moves.raw.save"

    # collect the titles: field 8 and field 22 are both recorded
    case "$snapshottype" in
      "fullwithusers")
          move_items | awk -F\" '{ print $8 }'   >> "$titles.txt"
          move_items | awk -F\" '{ print $22 }'  >> "$titles.txt"
          ;;
      "full")
          move_items | awk -F\" '{ print $22 }' | grep -v ':'  >> "$titles.txt"
          move_items | grep -v 'suppressedredirect' | awk -F\" '{ print $8 }' | grep -v ':'  >> "$titles.txt"
          # entries marked suppressedredirect are recorded as deletions of field 8
          move_items | grep 'suppressedredirect' | grep 'ns="0"' |  awk -F\" '{ print $16 " " $8 }'  >> "$deletes.xml"
          ;;
      "titleswithusers")
          move_items | awk -F\" '{ print $16 " " $8 }'   >> "$titles.txt"
          move_items | awk -F\" '{ print $16 " " $22 }'  >> "$titles.txt"
          ;;
      "titles")
          # Drop titles containing ':' by testing the title field only; the
          # timestamp in front always contains colons, so filtering the whole
          # output line would discard every entry.
          move_items | awk -F\" '$8 !~ /:/ { print $16 " " $8 }'    >> "$titles.txt"
          move_items | awk -F\" '$22 !~ /:/ { print $16 " " $22 }'  >> "$titles.txt"
          ;;
      *)
          ;;
    esac
    # take the timestamp from the last entry; the next query starts there
    nextstartdate=$(sed -e 's/>/>\n/g;' "$moves.raw" | grep '<item logid' | awk -F\" '{ print $16 }' | tail -n 1)
    # if it is empty we are done (this should never happen)
    if [[ -z "$nextstartdate" ]]; then
        break
    fi

    mvstartdate="$nextstartdate"
    sleep "$logsecs"
done
 
# imports
impstartdate=$globstartdate
impenddate=$globenddate

# Print the <item logid ...> lines of the current import-log batch, after
# breaking the XML after every "/>" and before every "<item".
import_items() {
    sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' "$imports.raw" | grep '<item logid'
}

while [ 1 ]; do
    if [ "$do" != "imports" ] && [ "$do" != "all" ]; then
        break;
    fi

    echo getting imports $impstartdate to $impenddate
    # fetch the next batch of entries from the import log
    curl --retry 10 -H "Expect:" -f "http://$wiki/w/api.php?action=query&list=logevents&letype=import&lelimit=500&format=xml&lestart=$impstartdate&leend=$impenddate"  >  "$imports.raw"
    # keep curl's status now: the [ ] test below resets $?, which would make
    # the error message always report 0
    curlstatus=$?
    if [ "$curlstatus" -ne 0 ]; then
        echo "Error $curlstatus from curl, unable to get imports, bailing"
        exit 1
    fi
    # a reply identical to the previous one means nothing new is left to fetch
    if [ -e "$imports.cmp" ] && cmp -s "$imports.raw" "$imports.cmp"; then
        break
    fi
    cp "$imports.raw" "$imports.cmp"
    cat "$imports.raw" >> "$imports.raw.save"

    # collect the titles (prefixed with the timestamp for the titles-only types)
    case "$snapshottype" in
        "fullwithusers")
            import_items | awk -F\" '{ print $8 }'   >> "$titles.txt"
            ;;
        "full")
            import_items | awk -F\" '{ print $8 }' | grep -v ':'  >> "$titles.txt"
            ;;
        "titleswithusers")
            import_items | awk -F\" '{ print $16 " " $8 }'   >> "$titles.txt"
            ;;
        "titles")
            # Drop titles containing ':' by testing the title field only; the
            # timestamp in front always contains colons, so filtering the
            # whole output line would discard every entry.
            import_items | awk -F\" '$8 !~ /:/ { print $16 " " $8 }'  >> "$titles.txt"
            ;;
        *)
            ;;
    esac

    # take the timestamp from the last entry; the next query starts there
    nextstartdate=$(sed -e 's/>/>\n/g;' "$imports.raw" | grep '<item logid' | awk -F\" '{ print $16 }' | tail -n 1)
    # if it is empty we are done (this should never happen)
    if [[ -z "$nextstartdate" ]]; then
        break
    fi
    impstartdate="$nextstartdate"
    sleep "$logsecs"
done
 
 
# uploads
upstartdate=$globstartdate
upenddate=$globenddate

# Print the <item logid ...> lines of the current upload-log batch, after
# breaking the XML after every "/>" and before every "<item".
upload_items() {
    sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' "$uploads.raw" | grep '<item logid'
}

while [ 1 ]; do

    if [ "$do" != "uploads" ] && [ "$do" != "all" ]; then
        break;
    fi

    echo getting uploads $upstartdate to $upenddate
    # fetch the next batch of entries from the upload log
    curl --retry 10 -H "Expect:" -f "http://$wiki/w/api.php?action=query&list=logevents&letype=upload&lelimit=500&format=xml&lestart=$upstartdate&leend=$upenddate"  >  "$uploads.raw"
    # keep curl's status now: the [ ] test below resets $?, which would make
    # the error message always report 0
    curlstatus=$?
    if [ "$curlstatus" -ne 0 ]; then
        echo "Error $curlstatus from curl, unable to get uploads, bailing"
        exit 1
    fi
    # a reply identical to the previous one means nothing new is left to fetch
    if [ -e "$uploads.cmp" ] && cmp -s "$uploads.raw" "$uploads.cmp"; then
        break
    fi
    cp "$uploads.raw" "$uploads.cmp"
    cat "$uploads.raw" >> "$uploads.raw.save"

    # collect the titles (prefixed with the timestamp for the titles-only types)
    case "$snapshottype" in
        "fullwithusers")
            upload_items | awk -F\" '{ print $8 }'  >> "$titles.txt"
            ;;
        "full")
            upload_items | awk -F\" '{ print $8 }' | grep -v ':'  >> "$titles.txt"
            ;;
        "titleswithusers")
            upload_items | awk -F\" '{ print $16 " " $8 }'  >> "$titles.txt"
            ;;
        "titles")
            # Drop titles containing ':' by testing the title field only; the
            # timestamp in front always contains colons, so filtering the
            # whole output line would discard every entry.
            upload_items | awk -F\" '$8 !~ /:/ { print $16 " " $8 }'  >> "$titles.txt"
            ;;
        *)
            ;;
    esac
    # take the timestamp from the last entry; the next query starts there
    nextstartdate=$(sed -e 's/>/>\n/g;' "$uploads.raw" | grep '<item logid' | awk -F\" '{ print $16 }' | tail -n 1)
    # if it is empty we are done (this should never happen)
    if [[ -z "$nextstartdate" ]]; then
        break
    fi
    upstartdate="$nextstartdate"
    sleep "$logsecs"
done
 
if [ "$snapshottype" == "fullwithusers" ] || [ "$snapshottype" == "full" ]; then

    # pages for the collected titles: sort the title list and drop duplicates
    mv "$titles.txt" "$titles.txt-temp"
    ./sort.pl < "$titles.txt-temp" | ./uniq.pl > "$titles.txt"

    count=1
    while [ 1 ]; do

        if [ "$do" != "pages" ] && [ "$do" != "all" ]; then
            break;
        fi

        echo getting pages $count to $(( count + 499 ))

        # next 500 titles
        tail -n +"$count" "$titles.txt" | head -n 500 > "$titles.500.txt"
        left=$(wc -l < "$titles.500.txt")
        if [ "$left" -eq 0 ]; then
            break;
        fi
        count=$(( count + 500 ))

        # Use the export page name from the configuration (expurl); fall back
        # to Special:Export when the configuration does not define it.
        curl --retry 10 -H "Expect:" -f -F "curonly=1" -F "wpDownload=1" -F "pages=<$titles.500.txt" "http://$wiki/w/index.php?title=${expurl:-Special:Export}&action=submit" > "$pages.xml-temp"
        # keep curl's status now: the [ ] test below resets $?, which would
        # make the error message always report 0
        curlstatus=$?
        if [ "$curlstatus" -ne 0 ]; then
            echo "Error $curlstatus from curl, unable to get xml pages, bailing"
            exit 1
        fi
        if [ -e "$pages.xml" ]; then
            mv "$pages.xml" "$pages.xml-old"
        fi
        # put it in front of the older batch, and back into the same filename
        # (so most recent revs are at the beginning)
        if [ -e "$pages.xml-old" ]; then
            cat "$pages.xml-temp" "$pages.xml-old" > "$pages.xml"
        else
            cat "$pages.xml-temp" > "$pages.xml"
        fi
        sleep "$pagesecs"
    done

fi
 
# deletes
delstartdate=$globstartdate
delenddate=$globenddate

while [ 1 ]; do

    if [ "$do" != "deletes" ] && [ "$do" != "all" ]; then
        break;
    fi

    echo getting deletes $delstartdate to $delenddate

    # get next lines from delete log
    curl --retry 10 -H "Expect:" -f "http://$wiki/w/api.php?action=query&list=logevents&letype=delete&lelimit=500&format=xml&lestart=$delstartdate&leend=$delenddate"  >  "$deletes.raw"
    # keep curl's status now: the [ ] test below resets $?, which would make
    # the error message always report 0
    curlstatus=$?
    if [ "$curlstatus" -ne 0 ]; then
        echo "Error $curlstatus from curl, unable to get deletes, bailing"
        exit 1
    fi
    # a reply identical to the previous one means nothing new is left to fetch
    if [ -e "$deletes.cmp" ] && cmp -s "$deletes.raw" "$deletes.cmp"; then
        break
    fi
    cp "$deletes.raw" "$deletes.cmp"
    cat "$deletes.raw" >> "$deletes.raw.save"

    # create new batch of timestamp, title for each delete record
    # we don't bother to filter these based on snapshot type
    sed -e 's/>/>\n/g;' "$deletes.raw" | grep '<item logid' | grep -v 'action="revision"' | awk -F\" '{ print $16 " " $8 }'  >> "$deletes.xml"

    # take the timestamp from the last entry; the next query starts there
    nextstartdate=$(sed -e 's/>/>\n/g;' "$deletes.raw" | grep '<item logid' | awk -F\" '{ print $16 }' | tail -n 1)
    # if it is empty we are done (this should never happen)
    if [[ -z "$nextstartdate" ]]; then
        break
    fi

    delstartdate="$nextstartdate"
    sleep "$logsecs"
done
 
# Only the "merges" step and a complete run continue past this point; every
# other single-step invocation has finished its work.
case "$do" in
    "merges" | "all")
        ;;
    *)
        echo "done!"
        exit 0;
        ;;
esac

# merges of new pages, changed pages, and deletes

# Both merge helpers must be present in the current directory.
# NOTE(review): the titles-type merge further down runs differently named
# helper scripts, which this check does not look for - confirm whether it should.
for needed in ./merge-pages-main-and-export.pl ./merge-deletes.pl; do
    if [ ! -e $needed ]; then
        echo "One or more of the required scripts for this file are missing:"
        echo "merge-pages-main-and-export.pl  or  merge-deletes.pl."
        echo "Please make sure that they are all in the directory from where you are giving the"
        echo "command $0. "
        exit 1
    fi
done
 
if [ "$snapshottype" == "titleswithusers" ] || [ "$snapshottype" == "titles" ]; then

    if [ ! -e "$lastfull" ] && [ ! -e "$lastfull.bz2" ]; then
        echo "$lastfull{.bz2} does not exist.  Please copy your last full incremental into this file"
        echo "and run this script again as   domerges.sh $1 $2 $3 in order to finish this last step."
        echo "You can either compress it as a bz2 file or leave it uncompressed."
        exit 1
    fi
    # a .bz2 copy, when present, is the one that gets read
    compressed=""
    if [ -e "$lastfull.bz2" ]; then
       compressed="true"
    fi

    # full xml files have this in their first line
    if [ -z "$compressed" ]; then
        isxml=$(head -n 1 "$lastfull" | grep -E '<page|<mediawiki')
    else
        isxml=$(bzcat "$lastfull.bz2" | head -n 1 | grep -E '<page|<mediawiki')
    fi
    if [ -n "$isxml" ]; then
        echo "generating titles from standard xml file..."
        # we must get the ts and title from the xml file and stuff it somewhere.
        if [ -z "$compressed" ]; then
            ./full2titles.pl < "$lastfull" > "$lastfull.titles"
            mv "$lastfull" "$lastfull.sav"
        else
            # read the compressed file that was detected above, not the
            # uncompressed name (which need not exist in this branch)
            bzcat "$lastfull.bz2" | ./full2titles.pl > "$lastfull.titles"
            mv "$lastfull.bz2" "$lastfull.bz2.sav"
        fi
        mv "$lastfull.titles" "$lastfull"
        # the generated titles list is plain text
        compressed=""
    fi

    # lose those pages from the full xml dump if the pages in full are older. then cat the rest
    # (from the exports that are newer) on the end.
    echo rewriting full titles list 
    if [ -z "$compressed" ]; then
        ./merge-pages-main-and-export-titles.pl "$titles.txt" < "$lastfull" > "$full-titles.xml-temp"
    else
        # a compressed titles list has to be decompressed before it is merged
        bzcat "$lastfull.bz2" | ./merge-pages-main-and-export-titles.pl "$titles.txt" > "$full-titles.xml-temp"
    fi
    # process the deletes
    echo processing deletes
    ./merge-deletes-titles.pl "$deletes.xml" < "$full-titles.xml-temp" > "$full-titles.xml"
    echo copying files into place
    # set up new full to be the next file we use
    cp "$full-titles.xml" "$lastfull"
    # don't do this til the end, in case of failure
    echo "$lastdaterun" > "$lastrun"
    cp "$full-titles.xml" "full-titles.$ext.xml"
    echo new full titles list is now in place at full-titles.$ext.xml and $lastfull

else

    # lose those pages from the full xml dump if the pages in full are older. then cat the rest
    # (from the exports that are newer) on the end.
    # NOTE(review): an uncompressed $lastfull is preferred here, while the
    # result below is written to $lastfull.bz2 - confirm that a leftover
    # uncompressed file cannot shadow the newer .bz2 on the next run.
    echo rewriting full dump 
    if [ -e "$lastfull" ]; then
        ./merge-pages-main-and-export.pl "$pages.xml" < "$lastfull" | bzip2 > "$full.xml.bz2-temp"
    elif [ -e "$lastfull.bz2" ]; then
        bzcat "$lastfull.bz2" | ./merge-pages-main-and-export.pl "$pages.xml" | bzip2 >  "$full.xml.bz2-temp"
    else
        echo "$lastfull{.bz2} does not exist.  Please copy your last full incremental into this file"
        echo "and run this script again as   domerges.sh $1 $2 $3 in order to finish this last step."
        echo "You can copy the uncompressed zml file or you can copy it as a bz2 file."
        exit 1
    fi

    # process the deletes
    echo processing deletes
    bzcat "$full.xml.bz2-temp" | ./merge-deletes.pl "$deletes.xml" | bzip2 > "$full.xml.bz2"
    echo copying files into place
    # set up new full to be the next file we use
    cp "$full.xml.bz2" "$lastfull.bz2"
    # don't do this til the end, in case of failure.  Write to the configured
    # last-run file (the one the lastrun date form reads); last_run is the
    # fallback when the configuration does not name one.
    echo "$lastdaterun" > "${lastrun:-last_run}"
    cp "$full.xml.bz2" "full.$ext.xml.bz2"
    echo new full is now in place at full.$ext.xml.bz2 and $lastfull.bz2

fi

# done!
echo "done!"
exit 0

[] sort.pl

#!/usr/bin/perl
# Read every line of STDIN as UTF-8 text and print the lines in the order
# produced by Perl's default sort (plain string comparison).

use strict;
use warnings;
use utf8;

# The I/O layers are set explicitly with binmode; the "encoding" pragma that
# used to be loaded here as well is rejected by Perl 5.26 and later.
binmode(STDOUT, ":utf8");
binmode(STDIN, ":utf8");

print foreach sort <STDIN>;

[] uniq.pl

#!/usr/bin/perl
# Copy STDIN to STDOUT as UTF-8 text, dropping any line that is identical to
# the line immediately before it (so sorted input comes out de-duplicated).

use strict;
use warnings;
use utf8;

# The I/O layers are set explicitly with binmode; the "encoding" pragma that
# used to be loaded here as well is rejected by Perl 5.26 and later.
binmode(STDOUT, ":utf8");
binmode(STDIN, ":utf8");

my $prevline = "";
while (<STDIN>) {
    if ($_ ne $prevline) {
        print;
        $prevline = $_;
    }
}

[] merge-pages-main-and-export.pl

#!/usr/bin/perl
# Merge a full page dump (read from STDIN) with a file of exported pages
# (named by the first argument) and write the merged pages to STDOUT.
#
#  1. collect title => timestamp for every page in the export file
#  2. copy each page of the full dump whose title is absent from the export,
#     or whose export timestamp is lower than its own timestamp
#  3. append the export's pages, except those whose full-dump copy was
#     already written in step 2

use strict;
use warnings;
use utf8;

# The I/O layers are set explicitly with binmode; the "encoding" pragma that
# used to be loaded here as well is rejected by Perl 5.26 and later.
binmode(STDOUT, ":utf8");
binmode(STDIN, ":utf8");

unless (@ARGV) {
    die "Usage: $0 filename-of-exported-pages\n";
}

my $filename = shift;

my %titles;     # title => timestamp as found in the export file
my %wrote;      # titles present in the export whose full-dump copy was written
my %skipped;    # titles whose full-dump copy was not written

# hash of titles with timestamps from file...
open(my $export, '<', $filename) or die("can't open file $filename\n");
binmode($export, ":utf8");
my $temptitle = "";
while (<$export>) {
    if (/<title>(.*)<\/title>/) {
        $temptitle = $1;
    }
    elsif (/<timestamp>(.*)<\/timestamp>/) {
        $titles{$temptitle} = $1;
    }
}
close($export);

# Compare two timestamps numerically after removing the characters - : T Z.
# A missing or non-numeric timestamp counts as 0.
# Returns -1, 0 or 1 in the manner of <=>.
sub compareem {
    my ($ts1, $ts2) = @_;
    foreach my $ts ($ts1, $ts2) {
        $ts = "" unless defined $ts;
        $ts =~ s/[-:TZ]//g;
        $ts = 0 unless $ts =~ /^\d+$/;
    }
    return $ts1 <=> $ts2;
}

my $text = "";
my $title = "";
my $timestamp;
while (<STDIN>) {
    $text .= $_;
    if (/<page>/) {
        # a new page starts: begin collecting its text afresh
        $text = $_;
        $title = "";
    }
    elsif (/<title>(.*)<\/title>/) {
        $title = $1;
    }
    elsif (/<timestamp>(.*)<\/timestamp>/) {
        $timestamp = $1;
    }
    elsif (/<\/page>/) {
        # compare our timestamp with the one from titles... if ours is later, we write it
        if (compareem($titles{$title}, $timestamp) < 0) {
            print $text;
            if ($titles{$title}) {
                $wrote{$title} = 1;
            }
        }
        else {
            $skipped{$title} = 1;
        }
    }
}

# reopen file, we are going to read the stuff from it and skip the titles that we
# wrote already but write the rest
open($export, '<', $filename) or die("can't open file $filename\n");
binmode($export, ":utf8");
while (<$export>) {
    $text .= $_;
    if (/<page>/) {
        $text = $_;
        $title = "";
    }
    elsif (/<title>(.*)<\/title>/) {
        $title = $1;
    }
    elsif (/<\/page>/) {
        # full file had the page, but it was an older copy
        if ($skipped{$title}) {
            print $text;
        }
        # full file didn't have the page.
        elsif (!$wrote{$title}) {
            print $text;
        }
        # full file had the page and it was newer...
    }
}
close($export);

[] merge-deletes.pl

#!/usr/bin/perl
# Filter a page dump (read from STDIN) against a list of deletions (file named
# by the first argument, one "timestamp title" pair per line).  A page is
# written to STDOUT when its title has no deletion entry, or when the deletion
# timestamp is lower than the page's own timestamp.

use strict;
use warnings;
use utf8;

# The I/O layers are set explicitly with binmode; the "encoding" pragma that
# used to be loaded here as well is rejected by Perl 5.26 and later.
binmode(STDOUT, ":utf8");
binmode(STDIN, ":utf8");

unless (@ARGV) {
    die "Usage: $0 filename-of-deletions\n";
}

my $filename = shift;

my %titles;     # title => timestamp from the deletions file

# hash of titles with timestamps from file...
open(my $deletions, '<', $filename) or die("can't open file $filename\n");
binmode($deletions, ":utf8");
while (<$deletions>) {
    chomp;
    my ($deleted_at, $deleted_title) = split(/ /, $_, 2);
    # lines without a title part carry no usable entry
    next unless defined $deleted_title;
    $titles{$deleted_title} = $deleted_at;
}
close($deletions);

# Compare two timestamps numerically after removing the characters - : T Z.
# A missing or non-numeric timestamp counts as 0.
# Returns -1, 0 or 1 in the manner of <=>.
sub compareem {
    my ($ts1, $ts2) = @_;
    foreach my $ts ($ts1, $ts2) {
        $ts = "" unless defined $ts;
        $ts =~ s/[-:TZ]//g;
        $ts = 0 unless $ts =~ /^\d+$/;
    }
    return $ts1 <=> $ts2;
}

my $text = "";
my $title = "";
my $timestamp;
while (<STDIN>) {
    $text .= $_;
    if (/<page>/) {
        # a new page starts: begin collecting its text afresh
        $text = $_;
        $title = "";
    }
    elsif (/<title>(.*)<\/title>/) {
        $title = $1;
    }
    elsif (/<timestamp>(.*)<\/timestamp>/) {
        $timestamp = $1;
    }
    elsif (/<\/page>/) {
        # compare our timestamp with the one from titles... if ours is later, we write it
        if (compareem($titles{$title}, $timestamp) < 0) {
            print $text;
        }
    }
}

[] config.txt

# Configuration file for wiki snapshots.
# getrcs.sh reads it with "source", so every setting is a shell variable
# assignment.

# Host name of the wiki project to snapshot; change it to your project.
#wiki="en.wiktionary.org"
wiki="el.wiktionary.org"

# Name of the wiki's export page; change it if your wiki names it differently.
expurl='Special:Export'
#expurl='Ειδικό:Export'

# Type of snapshot; change it to one of:
#   fullwithusers  full  titleswithusers  titles
# fullwithusers means current copies of everything
# full means current copies of namespace 0
# titleswithusers means just the current titles of everything
# titles means just the current titles of namespace 0
snapshottype=fullwithusers

# How many seconds to sleep between log requests.
logsecs=2

# How many seconds to sleep between requests of 500 pages.
pagesecs=5

# Work directory where all intermediate files will live.
tmp="./tmp"

# Name of the file where the snapshot will be stored.
snapshot="last_full.xml"

# Directory where the snapshot will be stored.
snapshotdir="."

# Name of the file where the date of the last run is kept.
lastrun="last_run"
Προσωπικά εργαλεία
Περιοχές ονομάτων

Παραλλαγές
Ενέργειες
πλοήγηση
συνεισφορά
βοήθεια
Εργαλειοθήκη