#!/bin/sh
# spid - simple spider shell script to check Zooland et al. for consistency
# $Id: spid,v 1.234 2001/08/22 10:56:53 joke Rel $
#
# Usage:   spid <url>
#
# Fetches <url> with wget, extracts the targets of its href= attributes
# (absolute http/https/ftp links first, then relative links resolved against
# the page's base URL) and probes each one with "wget --spider".
#
# Output:  one "spid: <link> appears to be OK|BROKE ..." line per link on
#          stdout, followed by a summary.
# Files:   spid.log in the current directory receives wget's output for the
#          page fetch and for every probe (it is truncated on each run).
# Exit:    1 when no URL is given, 0 otherwise.

if [ -z "${1-}" ]; then
  echo "usage: spid <url>" >&2
  exit 1
fi

# Private scratch directory instead of predictable names in /tmp or the
# current directory; removed on every exit path.
tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/spid.XXXXXX") || exit 1
trap 'rm -rf "$tmpdir"' EXIT
trap 'exit 1' HUP INT TERM

index=$tmpdir/index    # fetched page with href= normalised to lower case
links=$tmpdir/links    # one candidate link per line
reqlog=$tmpdir/reqlog  # wget output of the probe currently being evaluated

# Fetch the page. Attribute names are case-insensitive in HTML, so fold every
# spelling of HREF= to the lower-case form the patterns below expect.
wget -t 1 -T 5 -O - "$1" 2>spid.log \
  | sed -e 's/[Hh][Rr][Ee][Ff]=/href=/g' >"$index"

# When the URL names a page (contains a literal ".htm"), relative links are
# resolved against its directory; otherwise against the URL itself.
case $1 in
  *.htm*) baseurl=$(dirname "$1") ;;
  *)      baseurl=$1 ;;
esac

# Escape the characters that are special in the sed replacement below
# (backslash, "&", and the "," used as delimiter).
baseurl_esc=$(printf '%s\n' "$baseurl" | sed -e 's/[\\&,]/\\&/g')

# Absolute http/https links.
grep -E -e 'href=.*(http|https):' "$index" \
  | tr -d '"' | tr -d "'" \
  | sed -e 's,^.*href=http,http,g;s, *>.*$,,g' \
  >"$links"

# Absolute ftp links.
grep -E -e 'href=.*ftp:' "$index" \
  | tr -d '"' | tr -d "'" \
  | sed -e 's,^.*href=ftp,ftp,g;s, *>.*$,,g' \
  >>"$links"

# Relative links: lines with an href= that carry neither a scheme handled
# above (or mailto:/news:) nor a "#"; they are prefixed with the base URL.
grep -E -v -e '(href=.*(http|https|ftp|mailto|news):|#)' "$index" \
  | grep -E -e 'href=' \
  | tr -d '"' | tr -d "'" \
  | sed -e "s,^.*href= *,${baseurl_esc}/,g" -e 's, *>.*$,,g' \
  >>"$links"

cnt=0

# Only the first whitespace-delimited word of each line is the link; anything
# after it is leftover tag attributes (e.g. target=_blank). Reading line by
# line also keeps "?" and "*" in URLs from being glob-expanded.
while read -r url rest; do
  [ -n "$url" ] || continue
  printf 'spid: %s appears to be ' "$url"

  # Capture this probe's output on its own so the checks below cannot match
  # lines left in spid.log by an earlier probe, then append it to the log.
  wget -t 1 -T 5 -S --spider "$url" >"$reqlog" 2>&1 </dev/null
  cat "$reqlog" >>spid.log

  if tail -2 "$reqlog" | grep '200 OK' >/dev/null; then
    date=$(tail -10 "$reqlog" | grep -i modified | sed -e 's/^.*: //')
    if [ -n "$date" ]; then
      echo "OK (dated $date)"
    else
      echo "OK"
    fi
  else
    cnt=$((cnt + 1))
    code=$(tail -10 "$reqlog" | grep 'HTTP request' \
      | sed -e 's/^.*awaiting response... //')
    if [ -n "$code" ]; then
      echo "BROKE (error $code)"
    else
      echo "BROKE (temporarily unavailable)"
    fi
  fi
done <"$links"

echo "--"

if [ "$cnt" -gt 0 ]; then
  echo "spid: $cnt suspicious links in '$1'"
fi
echo "spid: see file 'spid.log' for complete server/error messages."
echo "--"
echo ""

exit 0