#!/bin/bash

# getlinks--Given a URL, returns all of its relative and
#   absolute links. Has three options: -d to generate the primary
#   domains of every link, -i to list just those links that are
#   internal to the site (that is, other pages on the same site), and
#   -x to produce external links only (the opposite of -i).

if [ $# -eq 0 ] ; then
  echo "Usage: $0 [-d|-i|-x] url" >&2
  echo "-d=domains only, -i=internal refs only, -x=external only" >&2
  exit 1
fi

if [ $# -gt 1 ] ; then
  case "$1" in
    -d) lastcmd="cut -d/ -f3 | sort | uniq"
        shift
        ;;
    -i) basedomain="http://$(echo "$2" | cut -d/ -f3)/"
        lastcmd="grep \"^$basedomain\" | sed \"s|$basedomain||g\" | sort | uniq"
        shift
        ;;
    -x) basedomain="http://$(echo "$2" | cut -d/ -f3)/"
        lastcmd="grep -v \"^$basedomain\" | sort | uniq"
        shift
        ;;
     *) echo "$0: unknown option specified: $1" >&2
        exit 1
        ;;
  esac
else
  lastcmd="sort | uniq"
fi

# Dump the page with lynx, keep only the numbered URL list that follows
# the "References" heading, strip the reference numbers and any ?query
# strings, then apply the option-specific filter assembled above.

lynx -dump "$1" | \
  sed -n '/^References$/,$p' | \
  grep -E '[[:digit:]]+\.' | \
  awk '{print $2}' | \
  cut -d'?' -f1 | \
  eval $lastcmd

exit 0
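
A quick usage sketch, assuming the script is saved as getlinks, marked executable, and that lynx is installed and on the PATH (the URL below is purely illustrative):

  $ chmod +x getlinks
  $ ./getlinks http://www.example.com/        # all links, sorted and deduplicated
  $ ./getlinks -d http://www.example.com/     # primary domains of every link
  $ ./getlinks -i http://www.example.com/     # internal references only
  $ ./getlinks -x http://www.example.com/     # external references only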