#!/bin/bash # getlinks--Given a URL, returns all of its relative and # absolute links. Has three options: -d to generate the primary # domains of every link, -i to list just those links that are # internal to the site (that is, other pages on the same site), and # -x to produce external links only (the opposite of –i). if [ $# -eq 0 ] ; then echo "Usage: $0 [-d|-i|-x] url" >&2 echo "-d=domains only, -i=internal refs only, -x=external only" >&2 exit 1 fi if [ $# -gt 1 ] ; then case "$1" in -d) lastcmd="cut -d/ -f3 | sort | uniq" shift ;; -r) basedomain="http://$(echo $2 | cut -d/ -f3)/" lastcmd="grep \"^$basedomain\" | sed \"s|$basedomain||g\" | sort |\ uniq" shift ;; -a) basedomain="http://$(echo $2 | cut -d/ -f3)/" lastcmd="grep -v \"^$basedomain\" | sort | uniq" shift ;; *) echo "$0: unknown option specified: $1" >&2; exit 1 esac else lastcmd="sort | uniq" fi lynx -dump "$1" | \ sed -n '/^References$/,$p' | \ grep -E '[[:digit:]]+\.' | \ awk '{print $2}' | \ cut -d\? -f1 | \ eval $lastcmd exit 0