#!/bin/sh

# Generate an RSS 2.0 feed directly from your existing web site

# Author - Pádraig Brady
# Licence - LGPLV2
# Releases -
#    1.0 - Jun 19 2006 - Initial release
#    1.1 - Jun 26 2006 - Exclude files with "Exclude from bashfeed"
#                        HTML comment within the first 10 lines.
#    1.2 - May 01 2007 - Add author elements (from html if present)
#    1.4 - May 04 2011
#    http://github.com/pixelb/scripts/commits/master/scripts/bashfeed

# Just run this script from the root directory of your web site and
# it will generate feed items for the newest files. Generally I do this
# just before I sync my local web site copy to my public server.
# One can generate a feed for a subset of the site by still running
# from the root directory, and passing a subdirectory to start at.
# To change the feed TITLE and DESCRIPTION, set those environment
# variables before running the script.
# Which files are selected and excluded can be configured below.
# Note for html files it will extract the following elements if present
#
#   <title>Item title</title>
#   <META NAME="keywords" CONTENT="keyword1 keyword2">
#   <META NAME="description" CONTENT="Item description">
#   <META NAME="author" CONTENT="email@address (Full Name)">
#
# Note this script will keep the same item guid for an updated file.
# Just updating the pubDate will not cause liferea 1.0.11 at least
# to mark the item as updated (or update the timestamp even).
# One must change the description or title also, and so
# I set the (hidden) description to the file timestamp.
#
# Testing with thunderbird 1.0.8 shows that it indexes on link
# and so won't ever show updates to other fields. Therefore I append #seconds
# to the link to force it to create a new entry for an updated item.
#
# Note you may find the http://www.pixelbeat.org/scripts/fix script
# useful for doing edits to files that you don't want to show up
# as updated content in the feed, or generally edit a file without
# changing the modification date.
#
# Requires GNU find (-printf), GNU date (--reference, --rfc-2822)
# and GNU sed (the T command and the I flag).

num_files=10
site="www.pixelbeat.org"
author="P@draigBrady.com (Pádraig Brady)"
suggested_update_freq=1440 #mins
: "${TITLE=$site}"
: "${DESCRIPTION="latest from $site"}"
#files starting with . | files without a . | files ending in .c .cpp ...
include_re='(^|/)[.].+|(^|/)[^.]+$|[.](c|cpp|py|sh|rc|tips|fortune|html)$' #only show these files
exclude_re='(\.git/|priv/|tmp/|.htaccess|xvpics|timeline\.html|modified\.html|head\.html|header\.html|footer\.html|footer-home\.html|adds\.html|last\.html|fslint/(NEWS\.html|md5sum)|README)' #don't show these paths
default_files="index.html index.shtml index.php"

############# No user serviceable parts below ###################

# Literal newline and tab, used to build multi-line values and as the
# field separator between path and timestamp.
nl='
'
tab=$(printf '\t')

# Build a sed script that trims a trailing default file name, so that
# "dir/index.html" is linked as "dir/". Start from an empty value so a
# variable of the same name in the environment cannot leak in, and anchor
# the name to a path component so "myindex.html" is left untouched.
replace_default_files=''
for default_file in $default_files; do
  escaped_name=$(printf '%s\n' "$default_file" | sed 's/[.]/\\./g')
  replace_default_files="${replace_default_files}s/\(^\|.*\/\)${escaped_name}\$/\1/${nl}t${nl}"
done

# Succeeds (status 0) when stdin still contains a raw &, < or > once the
# escaped forms &amp; &lt; &gt; &quot; have been removed, i.e. when the
# text is NOT safe to embed in XML as is.
xml_unescaped() {
  sed 's/&amp;//g; s/&[lg]t;//g; s/&quot;//g' |
    grep -q "[&<>]"
}

# Print the newest $num_files candidate files, one path per line, newest
# first. $1 is the directory to start at (defaults to the current one).
newest_files() {
  find "${1:-.}" -type f -printf '%p\t%T@\n' |
    sed 's/^\.\///' | # strip leading ./ when "$1" is empty
    sort -t "$tab" -k2,2nr |
    cut -f1 |
    grep -E "$include_re" |
    grep -Ev "$exclude_re" |
    {
      selected=0
      while IFS= read -r file; do
        # head prints the first 10 lines, which is where the marker is honoured
        if ! head -- "$file" | grep -Fiq '<!-- Exclude from bashfeed -->'; then
          printf '%s\n' "$file"
          selected=$((selected + 1))
          [ "$selected" -eq "$num_files" ] && break
        fi
      done
    }
}

# Read file paths from stdin and print one RSS <item> per path.
# Returns 1 (after a message on stderr) when the metadata extracted from
# an HTML file contains characters that would make the feed invalid XML.
emit_items() {
  while IFS= read -r file; do
    pubDate=$(date --reference="$file" --rfc-2822)
    force_update=$(date --reference="$file" '+%s')

    # Reset every per-item field, page_author included, so that values
    # from a previous HTML file never carry over to the next item.
    title=''
    keywords=''
    description=''
    page_author=''

    case $file in
      *.html | *.shtml | *.php)
        # Each sed prints the first match only and then stops reading.
        title=$(sed -n 's/.*<title>\(.*\)<\/title>.*/\1/ip;T;q' < "$file")
        keywords=$(sed -n 's/.*<META.*NAME="keywords".*CONTENT="\(.*\)".*/\1/ip;T;q' < "$file")
        description=$(sed -n 's/.*<META.*NAME="description".*CONTENT="\(.*\)".*/\1/ip;T;q' < "$file")
        page_author=$(sed -n 's/.*<META.*NAME="author".*CONTENT="\(.*\)".*/\1/ip;T;q' < "$file")
        if printf '%s\n' "$title $keywords $description $page_author" | xml_unescaped; then
          printf '%s\n' "Error: HTML metadata in $file will not produce a valid XML feed" >&2
          return 1
        fi
        ;;
      *)
        if [ -x "$file" ]; then
          # I always have a 1 line description on line 3 of my scripts
          description=$(sed -n '3s/# \(.*\)/\1/p' < "$file")
        fi
        ;;
    esac

    file=$(printf '%s\n' "$file" | sed "$replace_default_files")
    [ -n "$title" ] || title=$file

    # $keywords is deliberately unquoted: one <category> per word.
    tags=''
    for keyword in $keywords; do
      tags="${tags}<category>${keyword}</category>${nl}      "
    done

    # Only emit an <author> when it differs from the channel-wide one.
    if [ -n "$page_author" ] && [ "$page_author" != "$author" ]; then
      page_author="<author>${page_author}</author>${nl}      "
    else
      page_author=''
    fi

    # The guid stays stable across updates while the link carries
    # #seconds, as explained in the header of this file.
    # NOTE(review): the exact layout inside the CDATA section (visible
    # description followed by a hidden timestamp comment) is reconstructed;
    # confirm against the upstream bashfeed script.
    printf '%s\n' \
      '' \
      '    <item>' \
      "      <title>$title</title>" \
      "      <guid>http://$site/$file</guid>" \
      "      <pubDate>$pubDate</pubDate>" \
      "      <link>http://$site/$file#$force_update</link>" \
      "      $page_author$tags<description><![CDATA[$description<!-- $pubDate -->]]></description>" \
      '    </item>'
  done
}

build_date=$(date --rfc-2822)
feed_language=$(printf '%s\n' "$LANG" | sed 's/\(..\)_.*/\1/')

# NOTE(review): the channel element names below follow the RSS 2.0
# specification for the values this script supplies; confirm the choice of
# <generator> and <pubDate> against the upstream bashfeed script.
printf '%s\n' \
  '<?xml version="1.0" encoding="utf-8"?>' \
  '<rss version="2.0">' \
  '' \
  '<channel>' \
  "  <title>$TITLE</title>" \
  "  <ttl>$suggested_update_freq</ttl>" \
  "  <link>http://$site/$1</link>" \
  '  <generator>http://www.pixelbeat.org/scripts/bashfeed</generator>' \
  "  <description>$DESCRIPTION</description>" \
  "  <managingEditor>$author</managingEditor>" \
  "  <pubDate>$build_date</pubDate>" \
  "  <language>$feed_language</language>"

# The closing tags are only written when every item was emitted cleanly,
# so a metadata error leaves a visibly truncated feed and a non-zero status.
newest_files "$1" | emit_items &&
  printf '%s\n' '' '</channel>' '</rss>'