Χρήστης:AtouBot/updatefromtitles.sh
Μετάβαση στην πλοήγηση
Πήδηση στην αναζήτηση
Σημείωση: αυτό το πρόγραμμα χρειάζεται τα αρχεία: sort.pl uniq.pl merge-pages-main-and-export.pl που θα βρείτε εδώ
#!/bin/bash
# Refresh selected pages inside a MediaWiki XML dump by re-fetching them,
# in batches of 500, from the wiki's Special:Export page and merging the
# newer revisions back into the dump.
#
# Usage: updatefromtitles.sh xmlfile listoftitles [configfile]
#
# Requires the helper scripts sort.pl, uniq.pl and
# merge-pages-main-and-export.pl in the current directory, and a config
# file that defines: tmp (work dir), wiki (hostname), pagesecs (sleep
# seconds between export batches).
#
# TODO
#
# check for available disk space at beginning of run
# of any of the three parts. We will need at least size($lastfull) bytes
# plus (finding out in a minute)

usage() {
    echo "Usage: $0 xmlfile listoftitles [configfile]"
    echo "where xml file is the file of pages you want to update"
    echo "and listoftitles is a file containing the titles of"
    echo "the pages you want to retrieve for updating the"
    echo "xml file, one per line, either with [[ ]] or without."
    echo
    echo "For example:"
    echo "$0 pages-to-edit.xml bot-titles.txt"
    echo
    echo "The optional configfile argument tells the script to use"
    echo "the config file you specify instead of the default config.txt"
    exit 1
}

if [ -z "$1" ] || [ -z "$2" ]; then
    usage
fi

# Optional third argument: alternate config file.
if [ -n "$3" ]; then
    if [ -e "$3" ]; then
        source "$3"
    else
        echo "Specified config file $3 does not exist."
        usage
    fi
else
    source ./config-updates.txt
fi

# The config file must provide these; fail early with a clear message
# instead of misbehaving later (e.g. 'mkdir -p ""' or curl to an empty host).
: "${tmp:?config must set tmp (working directory)}"
: "${wiki:?config must set wiki (hostname of the wiki)}"
: "${pagesecs:?config must set pagesecs (seconds to sleep between batches)}"

if [ ! -e "./sort.pl" ] || [ ! -e "./uniq.pl" ]; then
    echo "You are missing one or both of the files sort.pl or uniq.pl which should be in the same directory"
    echo "from which you run this command. Please put them in place and run this again."
    exit 1
fi

xmlfile="$1"
titlefile="$2"
ext=$(date +%m-%d-%Y)

mkdir -p "$tmp"
pages="$tmp/pages.$ext"
titles="$tmp/titles.$ext"
updated="$tmp/updated.$ext"
# Clear leftovers from an earlier run on the same date.
rm -f "$titles".* "$pages".*

# Normalize the title list: sort, dedupe, and strip any [[ ]] link markup.
./sort.pl < "$titlefile" | ./uniq.pl | sed -e 's/\[\[//g; s/\]\]//g;' > "$titles.txt"

batchsize=500
count=1
while true; do
    echo "getting pages $count to $(( count + batchsize - 1 ))"
    # Slice the next batch of titles out of the normalized list.
    tail -n +"$count" "$titles.txt" | head -n "$batchsize" > "$titles.500.txt"
    left=$(wc -l < "$titles.500.txt")
    if [ "$left" -eq 0 ]; then
        break
    fi
    count=$(( count + batchsize ))
    # Ask Special:Export for the current revision of each title in the batch.
    curl --retry 10 -f -F "curonly=1" -F "wpDownload=1" \
        -F "pages=<$titles.500.txt" \
        "http://$wiki/w/index.php?title=Special:Export&action=submit" > "$pages.xml-temp"
    rc=$?    # capture immediately; any later test would clobber curl's status
    if [ "$rc" -ne 0 ]; then
        echo "Error $rc from curl, unable to get xml pages, bailing"
        exit 1
    fi
    if [ -e "$pages.xml" ]; then
        mv "$pages.xml" "$pages.xml-old"
    fi
    # put it in front of the older batch, and back into the same filename
    # (so most recent revs are at the beginning)
    if [ -e "$pages.xml-old" ]; then
        cat "$pages.xml-temp" "$pages.xml-old" > "$pages.xml"
    else
        cat "$pages.xml-temp" > "$pages.xml"
    fi
    # Be polite to the wiki between export requests.
    sleep "$pagesecs"
done

# merge of the retrieved pages
if [ ! -e ./merge-pages-main-and-export.pl ]; then
    echo "The required script for this file, merge-pages-main-and-export.pl, is missing."
    echo "Please make sure that it is in the directory from where you are giving the"
    echo "command $0. "
    exit 1
fi

# lose those pages from the full xml dump if the pages in full are older. then cat the rest
# (from the exports that are newer) on the end.
echo "rewriting xml file"
if [ -e "$xmlfile" ]; then
    ./merge-pages-main-and-export.pl "$pages.xml" < "$xmlfile" | bzip2 > "$updated.xml.bz2"
elif [ -e "$xmlfile.bz2" ]; then
    bzcat "$xmlfile.bz2" | ./merge-pages-main-and-export.pl "$pages.xml" | bzip2 > "$updated.xml.bz2"
else
    echo "$xmlfile{.bz2} does not exist. Please copy your file of pages for updating into this file"
    echo "and run this script again. You can either compress it as a bz2 file or leave it uncompressed."
    exit 1
fi

cp "$updated.xml.bz2" updated.xml.bz2
echo "new updated file is now in place at $updated.xml.bz2 and updated.xml.bz2"

# done!
echo "done!"
exit 0