Χρήστης:AtouBot/getrcs.sh

Από Βικιλεξικό
Μετάβαση στην πλοήγηση Πήδηση στην αναζήτηση

Δείτε επίσης: Χρήστης:AtouBot/getrcs.sh/docs


getrcs.sh[επεξεργασία]

#!/bin/bash
# TODO
#
# check for available disk space at beginning of run
# of any of the three parts. We will need at least size($lastfull) bytes
# plus (finding out in a minute)

# usage
#
# Print the command-line help on standard output and end the script with
# exit status 1. Takes no arguments; $0 is used for the script name shown
# in the synopsis and in the examples.
usage() {
  cat <<EOF
Usage: $0 startdate  enddate [configfile]
where startdate is latest date from which to get changes
and enddate is the earliest date, in the local timezone.
The base date may be specified as either today, or lastrun,
where lastrun is the latest date you got changes from
during the previous run.

For example:
$0 today today-3d
$0 today-1h today-5h
$0 today lastrun
If you omit the d or h the increment is interpreted as days

Alternatively you can specify absolute timestamps.
These must be in the format yyyy-mm-ddThh:mm:ssZ
For example:
$0 2008-02-06T08:54:06Z  2008-01-23T08:00:00Z
In this case the times are interpreted as UTC times.

The optional configfile argument tells the script to use
the config file you specify instead of the default config.txt
EOF
  exit 1
}

# Both a start date and an end date are required.
if [ -z "$1" ] || [ -z "$2" ]; then
  usage
fi

# Load the settings: from the file named as third argument when there is
# one, otherwise from config.txt in the current directory. A missing file
# ends the run here in both cases; carrying on would leave $tmp, $wiki and
# the other settings empty for everything that follows.
if [ -n "$3" ]; then
    if [ -e "$3" ]; then
        source "$3"
    else
        echo "Specified config file $3 does not exist."
        usage
    fi
elif [ -e ./config.txt ]; then
    source ./config.txt
else
    echo "Default config file ./config.txt does not exist."
    usage
fi


# The collected title list is piped through these two helper scripts later
# on, so make sure they are present before doing any work.
if [ ! -e "./sort.pl" ] || [ ! -e "./uniq.pl" ]; then
      echo "You are missing one or both of the files sort.pl or uniq.pl which should be in the same directory"
      echo "from which you run this command. Please put them in place and run this again."
      exit 1
fi

# usage_lastrun
#
# Explain how to create the file that holds the timestamp of the last run
# and end the script with exit status 1.
#
# Globals: lastrun (read) - name of the timestamp file; it is expanded in
#          the suggested command as well, so that the command can be pasted
#          into a shell in which $lastrun is not set.
usage_lastrun() {

    echo "In order to use lastrun+-(d|h), you need to have the timestamp of the last run"
    echo "stored in the file $lastrun in the current directory.  To get the appropriate"
    echo "timestamp, run"
    echo "date +%s -d \"yyyy-mm-dd hh:mm:ss +0000\" > $lastrun"
    echo "Then run this script again."
    exit 1
}

# checkformat DATESPEC
#
# Turn a date specification into seconds since the epoch. The result is
# left in the global variable `secs`, which is where the callers read it.
#
# Accepted forms of DATESPEC:
#   (empty)                 the current time
#   yyyy-mm-ddThh:mm:ssZ    absolute timestamp (any spec containing a Z)
#   today | lastrun         base date, optionally followed by -N or +N with
#                           a trailing d (days, also the default) or
#                           h (hours), e.g. today-3d, lastrun+5h, today-2
#
# Globals:  secs (written); lastrun (read) - file holding the epoch seconds
#           of the previous run
# Returns:  0 on success. Input that cannot be interpreted ends the script
#           through usage or usage_lastrun.
checkformat() {
    local d="$1"
    local op basedate incr incrtype reformatted stored

    # no spec at all: use the current time
    if [ -z "$d" ]; then
        secs=$(date +%s)
        return 0
    fi

    # absolute form such as:  2008-01-23T08:00:00Z
    # converted to:           2008-01-23 08:00:00 +0000
    case "$d" in
        *Z*)
            reformatted=$(echo "$d" | sed -e 's/T/ /; s/Z/ +0000/;')
            if ! secs=$(date --date="$reformatted" +%s); then
                usage
            fi
            return 0
            ;;
    esac

    # relative form: split into base date and increment at the first sign;
    # a minus sign anywhere in the spec takes precedence over a plus sign
    case "$d" in
        *-*) op="-" ;;
        *+*) op="+" ;;
        *)   op="" ;;
    esac
    incr=0
    incrtype="d"
    if [ -z "$op" ]; then
        basedate=$d
    else
        basedate=${d%%"$op"*}
        incr=${d#*"$op"}
        # a trailing d or h selects the unit; without one the unit is days
        case "$incr" in
            *d) incr=${incr%d} ;;
            *h) incrtype="h"; incr=${incr%h} ;;
        esac
        # a bare sign or a bare unit letter means an increment of zero
        if [ -z "$incr" ]; then
            incr=0
        fi
        # anything but digits would break the arithmetic further down
        case "$incr" in
            *[!0-9]*) usage ;;
        esac
    fi

    case "$basedate" in
        'today')
            secs=$(date +%s)
            ;;
        'lastrun')
            if [ ! -e "$lastrun" ]; then
                usage_lastrun
            fi
            stored=$(cat "$lastrun")
            # date rejects a stored value that is not a valid epoch
            if ! secs=$(date +%s -d @"$stored"); then
                usage_lastrun
            fi
            ;;
        *)
            usage
            ;;
    esac

    # 10# forces base ten, so that e.g. "08" is not read as an octal number
    case "$incrtype" in
        'd') incr=$(( 10#$incr * 86400 )) ;;
        'h') incr=$(( 10#$incr * 3600 )) ;;
    esac
    case "$op" in
        '-') secs=$(( secs - incr )) ;;
        '+') secs=$(( secs + incr )) ;;
    esac
    return 0
}

# Resolve the two date arguments; checkformat leaves each result in $secs.
checkformat "$1"
startdatesecs=$secs
checkformat "$2"
enddatesecs=$secs

# suffix (month-day-year of the start date) carried by every work file
ext=$(date +%m-%d-%Y -d @$startdatesecs)

# the same two instants in the UTC timestamp form used in the API URLs
globstartdate=$(date -u -d @$startdatesecs +"%Y-%m-%dT%H:%M:%SZ")
globenddate=$(date -u -d @$enddatesecs +"%Y-%m-%dT%H:%M:%SZ")

# written to the last-run file once the merge step has finished
lastdaterun="$startdatesecs"
me=$(basename $0)

# work files of the individual steps
mkdir -p $tmp
changes="$tmp/changes.$ext"
moves="$tmp/moves.$ext"
imports="$tmp/imports.$ext"
uploads="$tmp/uploads.$ext"
deletes="$tmp/deletes.$ext"
pages="$tmp/pages.$ext"
titles="$tmp/titles.$ext"
full="$tmp/full.$ext"
lastfull="$snapshotdir/$snapshot"

# The name under which the script was started selects a single step;
# under any other name every step is run, starting from a clean slate.
case "$me" in
    getchanges.sh | getmoves.sh | getimports.sh | getuploads.sh | getdeletes.sh | getpages.sh)
        # getXXX.sh -> XXX
        do=${me#get}
        do=${do%.sh}
        ;;
    domerges.sh)
        do="merges"
        ;;
    *)
        rm -f  $titles.* $pages.* $deletes.* $moves.* $imports.* $uploads.* $changes.*
        do="all"
        ;;
esac
# a single step only discards its own earlier work files
[ "$do" = "all" ] || rm -f "$tmp/$do".*

# recent changes
# url2a is url1a restricted to namespace 0; the continuation value returned
# by the API is appended to these base URLs on every further pass.
url1a="https://${wiki}/w/api.php?action=query&list=recentchanges&continue=&rclimit=500&rctype=new|edit&format=xml&rcstart=${globstartdate}&rcend=${globenddate}"
url2a="https://${wiki}/w/api.php?action=query&list=recentchanges&continue=&rclimit=500&rctype=new|edit&format=xml&rcstart=${globstartdate}&rcend=${globenddate}&rcnamespace=0"
url1=$url1a
url2=$url2a

echo "getting recent changes from $globstartdate to $globenddate"

while [ 1 ]; do
  # this step only runs as getchanges.sh or as part of a complete run
  if [ "$do" != "changes" ] && [ "$do" != "all" ]; then
    break
  fi

  # pick the URL that matches the snapshot type
  case "$snapshottype" in
      "fullwithusers" | "titleswithusers")
          rcurl="$url1"
          ;;
      "full" | "titles")
          rcurl="$url2"
          ;;
      *)
          echo "Unknown snapshot type. Please check your configuration file and"
          echo "run this step again."
          exit 1
          ;;
  esac

  # get the next lines from the recent changes log
  curl --retry 10 -H "Expect:" -f "$rcurl" > "$changes.raw"
  # keep curl's status: inside the if below, $? is the status of the test
  curlstatus=$?
  if [ "$curlstatus" -ne 0 ]; then
      echo "Error $curlstatus from curl, unable to get recent changes, bailing"
      exit 1
  fi
  # (an older cmp-based end-of-data check was disabled here; the empty
  # continuation value further down is what ends the loop)
  cat "$changes.raw" >> "$changes.raw.save"

  # get the titles: one <rc ...> element per line, then pick fields of the
  # quote-separated line (field numbers depend on the attribute order of
  # the API output - presumably 6 is the title and 16 the timestamp)
  case "$snapshottype" in
      "fullwithusers" | "full")
          sed -e 's/>/>\n/g;' "$changes.raw" | grep '<rc type=' | awk -F\" '{ print $6 }'  >> "$titles.txt"
          ;;
      "titleswithusers" | "titles" | *)
          sed -e 's/>/>\n/g;' "$changes.raw" | grep '<rc type=' | awk -F\" '{ print $16 " " $6 }'  >> "$titles.txt"
          ;;
  esac

  # get the continuation value from the line carrying rccontinue
  nextstartdate=$(sed -e 's/>/>\n/g;' "$changes.raw" | grep 'rccontinue=' | awk -F\" '{ print $2 }')
  # if it is empty... we are done
  if [[ -z "$nextstartdate" ]]; then
    break
  fi

  url1="${url1a}&rccontinue=${nextstartdate}"
  url2="${url2a}&rccontinue=${nextstartdate}"
  sleep $logsecs
done

#moves
url1a="https://${wiki}/w/api.php?action=query&list=logevents&letype=move&continue=&lelimit=500&format=xml&lestart=${globstartdate}&leend=${globenddate}"
url1=$url1a

echo "getting moves from $globstartdate to $globenddate"
while [ 1 ]; do
    # this step only runs as getmoves.sh or as part of a complete run
    if [ "$do" != "moves" ] && [ "$do" != "all" ]; then
        break
    fi

    # get the next lines from the move log
    curl --retry 10 -H "Expect:" -f "$url1" > "$moves.raw"
    # keep curl's status: inside the if below, $? is the status of the test
    curlstatus=$?
    if [ "$curlstatus" -ne 0 ]; then
        echo "Error $curlstatus from curl, unable to get moves, bailing"
        exit 1
    fi

    # (an older cmp-based end-of-data check was disabled here; the empty
    # continuation value further down is what ends the loop)
    cat "$moves.raw" >> "$moves.raw.save"

    # get the titles: one <item ...> element per line, then pick fields of
    # the quote-separated line (field numbers depend on the attribute order
    # of the API output)
    # only the results for "full" have been checked, the other snapshot
    # types are untested
    case "$snapshottype" in
      "fullwithusers")
          sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' "$moves.raw" | grep '<item logid' | awk -F\" '{ print $6 }'   >> "$titles.txt"
          sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' "$moves.raw" | grep '<item logid' | awk -F\" '{ print $24 }'  >> "$titles.txt"
          ;;
      "full")
          sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' "$moves.raw" | grep '<item logid' | awk -F\" '{ print $24 }' | grep -v ':'  >> "$titles.txt"
          sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' "$moves.raw" | grep '<item logid' | grep -v 'suppressredirect' | awk -F\" '{ print $6 }' | grep -v ':'  >> "$titles.txt"
          # moves flagged suppressredirect in namespace 0 are recorded in
          # the deletes list instead
          sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' "$moves.raw" | grep '<item logid' | grep 'suppressredirect' | grep 'ns="0"' |  awk -F\" '{ print $18 " " $6 }'  >> "$deletes.xml"
          ;;
      "titleswithusers")
          sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' "$moves.raw" | grep '<item logid' | awk -F\" '{ print $16 " " $8 }'   >> "$titles.txt"
          sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' "$moves.raw" | grep '<item logid' | awk -F\" '{ print $16 " " $22 }'  >> "$titles.txt"
          ;;
      "titles")
          sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' "$moves.raw" | grep '<item logid' | awk -F\" '{ print $16 " " $8 }' | grep -v ':'  >> "$titles.txt"
          sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' "$moves.raw" | grep '<item logid' | awk -F\" '{ print $16 " " $22 }' | grep -v ':'  >> "$titles.txt"
          ;;
      *)
          ;;
    esac
    # get the continuation value
    nextstartdate=$(sed -e 's/>/>\n/g;' "$moves.raw" | grep 'lecontinue=' | awk -F\" '{ print $2 }')
    # if it is empty... we are done
    if [[ -z "$nextstartdate" ]]; then
        break
    fi

    url1="${url1a}&lecontinue=${nextstartdate}"
    sleep $logsecs
done

# imports
url1a="https://${wiki}/w/api.php?action=query&list=logevents&letype=import&continue=&lelimit=500&format=xml&lestart=${globstartdate}&leend=${globenddate}"
url1=$url1a

echo "getting imports from $globstartdate to $globenddate"

while [ 1 ]; do
    # this step only runs as getimports.sh or as part of a complete run
    if [ "$do" != "imports" ] && [ "$do" != "all" ]; then
        break
    fi

    # get the next lines from the import log
    curl --retry 10 -H "Expect:" -f "$url1" > "$imports.raw"
    # keep curl's status: inside the if below, $? is the status of the test
    curlstatus=$?
    if [ "$curlstatus" -ne 0 ]; then
        echo "Error $curlstatus from curl, unable to get imports, bailing"
        exit 1
    fi
    # (an older cmp-based end-of-data check was disabled here; the empty
    # continuation value further down is what ends the loop)
    cat "$imports.raw" >> "$imports.raw.save"

    # get the titles: one <item ...> element per line, then pick fields of
    # the quote-separated line (field numbers depend on the attribute order
    # of the API output)
    case "$snapshottype" in
        "fullwithusers")
            sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' "$imports.raw" | grep '<item logid' | awk -F\" '{ print $6 }'   >> "$titles.txt"
            ;;
        "full")
            sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' "$imports.raw" | grep '<item logid' | awk -F\" '{ print $6 }' | grep -v ':'  >> "$titles.txt"
            ;;
        "titleswithusers")
            sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' "$imports.raw" | grep '<item logid' | awk -F\" '{ print $18 " " $6 }'   >> "$titles.txt"
            ;;
        "titles")
            sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' "$imports.raw" | grep '<item logid' | awk -F\" '{ print $18 " " $6 }' | grep -v ':'  >> "$titles.txt"
            ;;
        *)
            ;;
    esac

    # get the continuation value
    nextstartdate=$(sed -e 's/>/>\n/g;' "$imports.raw" | grep 'lecontinue=' | awk -F\" '{ print $2 }')
    # if it is empty... we are done
    if [[ -z "$nextstartdate" ]]; then
        break
    fi
    url1="${url1a}&lecontinue=${nextstartdate}"
    sleep $logsecs
done

# uploads
url1a="https://${wiki}/w/api.php?action=query&list=logevents&letype=upload&continue=&lelimit=500&format=xml&lestart=${globstartdate}&leend=${globenddate}"
url1=$url1a

echo getting uploads
while [ 1 ]; do

    # this step only runs as getuploads.sh or as part of a complete run
    if [ "$do" != "uploads" ] && [ "$do" != "all" ]; then
        break
    fi

    # get the next lines from the upload log
    curl --retry 10 -H "Expect:" -f "$url1" > "$uploads.raw"
    # keep curl's status: inside the if below, $? is the status of the test
    curlstatus=$?
    if [ "$curlstatus" -ne 0 ]; then
        echo "Error $curlstatus from curl, unable to get uploads, bailing"
        exit 1
    fi
    # (an older cmp-based end-of-data check was disabled here; the empty
    # continuation value further down is what ends the loop)
    cat "$uploads.raw" >> "$uploads.raw.save"

    # get the titles: one <item ...> element per line, then pick fields of
    # the quote-separated line (field numbers depend on the attribute order
    # of the API output)
    case "$snapshottype" in
        "fullwithusers")
            sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' "$uploads.raw" | grep '<item logid' | awk -F\" '{ print $6 }'  >> "$titles.txt"
            ;;
        "full")
            sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' "$uploads.raw" | grep '<item logid' | awk -F\" '{ print $6 }'| grep -v ':'  >> "$titles.txt"
            ;;
        "titleswithusers")
            sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' "$uploads.raw" | grep '<item logid' | awk -F\" '{ print $18 " " $6 }'  >> "$titles.txt"
            ;;
        "titles")
            sed -e 's/\/>/\/>\n/g; s/<item/\n<item/g;' "$uploads.raw" | grep '<item logid' | awk -F\" '{ print $18 " " $6 }'| grep -v ':'  >> "$titles.txt"
            ;;
        *)
            ;;
    esac
    # get the continuation value
    nextstartdate=$(sed -e 's/>/>\n/g;' "$uploads.raw" | grep 'lecontinue=' | awk -F\" '{ print $2 }')
    # if it is empty... we are done
    if [[ -z "$nextstartdate" ]]; then
        break
    fi
    url1="${url1a}&lecontinue=${nextstartdate}"
    sleep $logsecs
done

# Export the current text of every collected title; only the two "full"
# snapshot types need page text.
if [ "$snapshottype" == "fullwithusers" ] || [ "$snapshottype" == "full" ]; then

    # pages with the collected titles: sort the list and drop duplicates
    mv "$titles.txt" "$titles.txt-temp"
    cat "$titles.txt-temp" | ./sort.pl | ./uniq.pl > "$titles.txt"

    count=1
    while [ 1 ]; do

        # this step only runs as getpages.sh or as part of a complete run
        if [ "$do" != "pages" ] && [ "$do" != "all" ]; then
            break
        fi

        echo "getting pages $count to $(( count + 499 ))"

        # next 500 titles, starting at line $count
        tail -n +"$count" "$titles.txt" | head -n 500 > "$titles.500.txt"
        left=$(wc -l < "$titles.500.txt")
        if [ "$left" -eq 0 ]; then
            break
        fi
        count=$(( count + 500 ))

        # "pages=<file" makes curl send the content of the file as the value
        curl --retry 10 -H "Expect:" -f -F "curonly=1" -F "wpDownload=1" -F "pages=<$titles.500.txt" "https://$wiki/w/index.php?title=Special:Export&action=submit" > "$pages.xml-temp"
        # keep curl's status: inside the if below, $? is the status of the test
        curlstatus=$?
        if [ "$curlstatus" -ne 0 ]; then
            echo "Error $curlstatus from curl, unable to get xml pages, bailing"
            exit 1
        fi
        if [ -e "$pages.xml" ]; then
            mv "$pages.xml" "$pages.xml-old"
        fi
        # put it in front of the older batch, and back into the same filename
        # (so most recent revs are at the beginning)
        if [ -e "$pages.xml-old" ]; then
            cat "$pages.xml-temp" "$pages.xml-old" > "$pages.xml"
        else
            cat "$pages.xml-temp" > "$pages.xml"
        fi
        sleep $pagesecs
    done

fi

# deletes
url1a="https://${wiki}/w/api.php?action=query&list=logevents&letype=delete&continue=&lelimit=500&format=xml&lestart=${globstartdate}&leend=${globenddate}"
url1=$url1a
echo getting deletes

while [ 1 ]; do

    # this step only runs as getdeletes.sh or as part of a complete run
    if [ "$do" != "deletes" ] && [ "$do" != "all" ]; then
        break
    fi

    # get next lines from delete log
    curl --retry 10 -H "Expect:" -f "$url1" > "$deletes.raw"
    # keep curl's status: inside the if below, $? is the status of the test
    curlstatus=$?
    if [ "$curlstatus" -ne 0 ]; then
        echo "Error $curlstatus from curl, unable to get deletes, bailing"
        exit 1
    fi
    # (an older cmp-based end-of-data check was disabled here; the empty
    # continuation value further down is what ends the loop)
    cat "$deletes.raw" >> "$deletes.raw.save"

    # create new batch of timestamp, title for each delete record
    # we don't bother to filter these based on snapshot type
    # (records with action="revision" are left out; &#039; is turned back
    # into an apostrophe)
    sed -e 's/>/>\n/g;' "$deletes.raw" | grep '<item logid' | grep -v 'action="revision"' | awk -F\" '{ print $18 " " $6 }' | sed "s/&#039;/'/g" >> "$deletes.xml"

    # get the continuation value
    nextstartdate=$(sed -e 's/>/>\n/g;' "$deletes.raw" | grep 'lecontinue=' | awk -F\" '{ print $2 }')
    # if it is empty... we are done
    if [[ -z "$nextstartdate" ]]; then
        break
    fi

    url1="${url1a}&lecontinue=${nextstartdate}"
    sleep $logsecs
done

# The merge step only runs as domerges.sh or as part of a complete run.
if [ "$do" != "merges" ] && [ "$do" != "all" ]; then
  echo "done!"
  exit 0
fi

# merges of new pages, changed pages, and deletes

# NOTE(review): the titles branch below runs full2titles.pl,
# merge-pages-main-and-export-titles.pl and merge-deletes-titles.pl, none
# of which is checked here.
if [ ! -e ./merge-pages-main-and-export.pl ] || [ ! -e ./merge-deletes.pl ]; then
    echo "One or more of the required scripts for this file are missing:"
    echo "merge-pages-main-and-export.pl  or  merge-deletes.pl."
    echo "Please make sure that they are all in the directory from where you are giving the"
    echo "command $0. "
    exit 1
fi

if [ "$snapshottype" == "titleswithusers" ] || [ "$snapshottype" == "titles" ]; then

    if [ ! -e "$lastfull" ] && [ ! -e "$lastfull.bz2" ]; then
        echo "$lastfull{.bz2} does not exist.  Please copy your last full incremental into this file"
        echo "and run this script again as   domerges.sh $1 $2 $3 in order to finish this last step."
        echo "You can either compress it as a bz2 file or leave it uncompressed."
        exit 1
    fi
    # a .bz2 copy is preferred over an uncompressed one when both exist
    compressed=""
    if [ -e "$lastfull.bz2" ]; then
        compressed="true"
    fi

    # full xml files have this in their first line
    if [ -z "$compressed" ]; then
        isxml=$(head -n 1 "$lastfull" | grep -E '<page|<mediawiki')
    else
        isxml=$(bzcat "$lastfull.bz2" | head -n 1 | grep -E '<page|<mediawiki')
    fi
    if [ -n "$isxml" ]; then
        echo "generating titles from standard xml file..."
        # we must get the ts and title from the xml file and stuff it somewhere.
        # The xml original is kept under a .sav name.
        if [ -z "$compressed" ]; then
            cat "$lastfull" | ./full2titles.pl > "$lastfull.titles"
            mv "$lastfull" "$lastfull.sav"
        else
            # read the compressed copy - in this branch an uncompressed
            # $lastfull need not exist at all
            bzcat "$lastfull.bz2" | ./full2titles.pl > "$lastfull.titles"
            mv "$lastfull.bz2" "$lastfull.bz2.sav"
        fi
        mv "$lastfull.titles" "$lastfull"
        # from here on the titles list is the uncompressed $lastfull
        compressed=""
    fi

    # lose those pages from the full xml dump if the pages in full are older. then cat the rest
    # (from the exports that are newer) on the end.
    echo rewriting full titles list
    if [ -z "$compressed" ]; then
        cat "$lastfull" | ./merge-pages-main-and-export-titles.pl "$titles.txt" > "$full-titles.xml-temp"
    else
        # the merge script is fed plain text in the other branch, so a
        # compressed titles list has to be decompressed first
        bzcat "$lastfull.bz2" | ./merge-pages-main-and-export-titles.pl "$titles.txt" > "$full-titles.xml-temp"
    fi
    # process the deletes
    echo processing deletes
    cat "$full-titles.xml-temp" | ./merge-deletes-titles.pl "$deletes.xml" > "$full-titles.xml"
    echo copying files into place
    # set up new full to be the next file we use
    # NOTE(review): when a compressed titles list was used, $lastfull.bz2
    # stays in place and will again be preferred on the next run - confirm
    # whether it should be moved aside here.
    cp "$full-titles.xml" "$lastfull"
    # don't do this til the end, in case of failure
    echo "$lastdaterun" > "$lastrun"
    cp "$full-titles.xml" "full-titles.$ext.xml"
    echo "new full titles list is now in place at full-titles.$ext.xml and $lastfull"

else

    # lose those pages from the full xml dump if the pages in full are older. then cat the rest
    # (from the exports that are newer) on the end.
    echo rewriting full dump
    if [ -e "$lastfull" ]; then
        cat "$lastfull" | ./merge-pages-main-and-export.pl "$pages.xml" | bzip2 > "$full.xml.bz2-temp"
    elif [ -e "$lastfull.bz2" ]; then
        bzcat "$lastfull.bz2" | ./merge-pages-main-and-export.pl "$pages.xml" | bzip2 > "$full.xml.bz2-temp"
    else
        echo "$lastfull{.bz2} does not exist.  Please copy your last full incremental into this file"
        echo "and run this script again as   domerges.sh $1 $2 $3 in order to finish this last step."
        echo "You can copy the uncompressed xml file or you can copy it as a bz2 file."
        exit 1
    fi

    # process the deletes
    echo processing deletes
    bzcat "$full.xml.bz2-temp" | ./merge-deletes.pl "$deletes.xml" | bzip2 > "$full.xml.bz2"
    echo copying files into place
    # set up new full to be the next file we use
    # NOTE(review): an uncompressed $lastfull is preferred above, so if one
    # exists it will be merged again on the next run instead of the
    # $lastfull.bz2 written here - confirm whether it should be moved aside.
    cp "$full.xml.bz2" "$lastfull.bz2"
    # don't do this til the end, in case of failure
    # (written to the configured last-run file, the one checkformat reads)
    echo "$lastdaterun" > "$lastrun"
    cp "$full.xml.bz2" "full.$ext.xml.bz2"
    echo "new full is now in place at full.$ext.xml.bz2 and $lastfull.bz2"

fi

# done!
echo "done!"
exit 0

sort.pl[επεξεργασία]

#!/usr/bin/perl

# Read all lines from standard input, sort them with Perl's default string
# comparison and write them to standard output. Both streams are handled
# as UTF-8.
#
# The deprecated "encoding" pragma is no longer loaded: newer Perl
# releases refuse it, and the binmode calls already set the stream layers.

use strict;
use warnings;
use utf8;

binmode(STDOUT, ":utf8");
binmode(STDIN, ":utf8");

print sort <STDIN>;

uniq.pl[επεξεργασία]

#!/usr/bin/perl

# Copy standard input to standard output, leaving out every line that is
# identical to the line directly before it. Both streams are handled as
# UTF-8.
#
# The deprecated "encoding" pragma is no longer loaded: newer Perl
# releases refuse it, and the binmode calls already set the stream layers.

use strict;
use warnings;
use utf8;

binmode(STDOUT, ":utf8");
binmode(STDIN, ":utf8");

my $prevline = "";
while (my $line = <STDIN>) {
    # only adjacent duplicates are dropped
    next if $line eq $prevline;
    print $line;
    $prevline = $line;
}

merge-pages-main-and-export.pl[επεξεργασία]

#!/usr/bin/perl

# Merge a full dump, read from standard input, with a file of exported
# pages named on the command line, and write the result to standard
# output:
#   - collect the titles and timestamps of the exported pages
#   - read the old dump; a page is written when the export has no copy of
#     it, or only a copy with an older timestamp
#   - read the export again and write every page that was not superseded
#     by a newer copy in the dump
# Only the text from <page> to </page> is copied; anything outside page
# elements is dropped.
#
# The deprecated "encoding" pragma is no longer loaded: newer Perl
# releases refuse it, and the binmode calls already set the stream layers.

use strict;
use warnings;
use utf8;

binmode(STDOUT, ":utf8");
binmode(STDIN, ":utf8");

unless (@ARGV) {
    die "Usage: $0 filename-of-exported-pages\n";
}

my $filename = shift;

# hash of titles with timestamps from file...
# (with several timestamps inside one page, the last one read wins)
my %titles;
my $temptitle = "";
open(my $export, '<', $filename) or die("can't open file $filename\n");
binmode($export, ":utf8");
while (<$export>) {
    if (/<title>(.*)<\/title>/) {
        $temptitle = $1;
    }
    elsif (/<timestamp>(.*)<\/timestamp>/) {
        $titles{$temptitle} = $1;
    }
}
close($export);

# Compare two timestamps of the form yyyy-mm-ddThh:mm:ssZ numerically.
# A missing timestamp counts as 0. Returns -1, 0 or 1 like <=>.
sub compareem {
    my ($ts1, $ts2) = @_;
    for ($ts1, $ts2) {
        $_ = '' unless defined $_;
        s/[-:TZ]//g;
        $_ = 0 if $_ eq '';
    }
    return $ts1 <=> $ts2;
}

my (%wrote, %skipped);
my $text = "";
my $title = "";
my $timestamp;    # not reset per page: a page without one reuses the previous value
while (<STDIN>) {
    $text .= $_;
    if (/<page>/) {
        # start collecting a new page
        $text = $_;
        $title = "";
    }
    elsif (/<title>(.*)<\/title>/) {
        $title = $1;
    }
    elsif (/<timestamp>(.*)<\/timestamp>/) {
        $timestamp = $1;
    }
    elsif (/<\/page>/) {
        # compare our timestamp with the one from titles... if ours is later, we write it
        if (compareem($titles{$title}, $timestamp) < 0) {
            print $text;
            if ($titles{$title}) {
                $wrote{$title} = 1;
            }
        }
        else {
            $skipped{$title} = 1;
        }
    }
}

# reopen file, we are going to read the stuff from it and skip the titles that we
# wrote already but write the rest
open($export, '<', $filename) or die("can't open file $filename\n");
binmode($export, ":utf8");
while (<$export>) {
    $text .= $_;
    if (/<page>/) {
        $text = $_;
        $title = "";
    }
    elsif (/<title>(.*)<\/title>/) {
        $title = $1;
    }
    elsif (/<\/page>/) {
        # written when the full file had only an older (or equally old)
        # copy of the page, or did not have the page at all; left out when
        # the full file had the page and it was newer
        if ($skipped{$title} || !$wrote{$title}) {
            print $text;
        }
    }
}
close($export);

merge-deletes.pl[επεξεργασία]

#!/usr/bin/perl

# Remove deleted pages from a dump:
#   - read the deletions (one "timestamp title" line each) from the file
#     named on the command line
#   - read the old dump from standard input
#   - write every page whose title was not deleted at or after the page's
#     own timestamp
# Only the text from <page> to </page> is copied; anything outside page
# elements is dropped.
#
# The deprecated "encoding" pragma is no longer loaded: newer Perl
# releases refuse it, and the binmode calls already set the stream layers.

use strict;
use warnings;
use utf8;

binmode(STDOUT, ":utf8");
binmode(STDIN, ":utf8");

unless (@ARGV) {
    die "Usage: $0 filename-of-deletions\n";
}

my $filename = shift;

# hash of titles with timestamps from file...
# (for a title listed more than once, the last line read wins)
my %titles;
open(my $deletions, '<', $filename) or die("can't open file $filename\n");
binmode($deletions, ":utf8");
while (<$deletions>) {
    chomp;
    # the title is everything after the first space, so it may contain spaces
    my ($deleted_at, $deleted_title) = split(/ /, $_, 2);
    $deleted_title = "" unless defined $deleted_title;
    $titles{$deleted_title} = $deleted_at;
}
close($deletions);

# Compare two timestamps of the form yyyy-mm-ddThh:mm:ssZ numerically.
# A missing timestamp counts as 0. Returns -1, 0 or 1 like <=>.
sub compareem {
    my ($ts1, $ts2) = @_;
    for ($ts1, $ts2) {
        $_ = '' unless defined $_;
        s/[-:TZ]//g;
        $_ = 0 if $_ eq '';
    }
    return $ts1 <=> $ts2;
}

my $text = "";
my $title = "";
my $timestamp;    # not reset per page: a page without one reuses the previous value
while (<STDIN>) {
    $text .= $_;
    if (/<page>/) {
        # start collecting a new page
        $text = $_;
        $title = "";
    }
    elsif (/<title>(.*)<\/title>/) {
        $title = $1;
    }
    elsif (/<timestamp>(.*)<\/timestamp>/) {
        $timestamp = $1;
    }
    elsif (/<\/page>/) {
        # compare our timestamp with the one from titles... if ours is later, we write it
        if (compareem($titles{$title}, $timestamp) < 0) {
            print $text;
        }
    }
}

config.txt[επεξεργασία]

# configuration file for wiki snapshots
# (read by getrcs.sh with the shell's "source" command, so every line
# must be valid shell syntax)

# change me to your project: host name of the wiki that is queried
#wiki="en.wiktionary.org"
wiki="el.wiktionary.org"

#change me to name of export page
# NOTE(review): getrcs.sh as shown builds its export URL with a literal
# Special:Export and does not appear to read this variable.
expurl='Special:Export'
#expurl='Ειδικό:Export'

#change me to type of snapshot
#one of:   fullwithusers  full   titleswithusers   titles
# fullwithusers means current copies of everything
# full means current copies of namespace 0
# titleswithusers means just the current titles of everything
# titles means just the current titles of namespace 0
snapshottype=fullwithusers

#how many seconds to sleep between log requests
logsecs=2

#how many seconds to sleep between requests of 500 pages
pagesecs=5

#work dir where all intermediate files will live
# (created with mkdir -p at the start of a run)
tmp="./tmp"

#name of file where snapshot will be stored
# (the script also accepts this name with .bz2 appended)
snapshot="last_full.xml"

#directory where snapshot will be stored
snapshotdir="."

#name of file where we will keep date of last run
# (holds seconds since the epoch; used when a date is given as lastrun)
lastrun="last_run"