Snip - extract a named element from an HTML file
#!/bin/bash
#
# snip - extract a named element from an HTML file and render it as text.
#
# Usage:   snip <element type>#<element id> <file to parse>
# Example: snip div#bodyContent /tmp/index.html
#
# How it works: the file is scanned line by line. The first line containing
# id="<element id>" (case-insensitive) is taken as the start of the element.
# From there, opening and closing tags of <element type> are counted per
# line; the element ends on the first line where the nesting depth drops to
# zero or below. Lines start..end are then piped through html2text.
#
# Limitations (inherent to the line-based approach):
#   * Only double-quoted id attributes are recognised.
#   * Tags of the same type that share the first line with the element but
#     precede it are counted too, so well-formed, reasonably formatted HTML
#     is assumed.
#
# Exit status: 0 on success or when help is shown, 1 on any error.

set -u

# Print the help text to stdout and exit successfully.
printhelp() {
  cat <<'EOF'
snip is a simple bash HTML cutter that works by extracting a specific
element from an HTML file and feeding it to html2text. It presupposes
well-formed HTML and that you know the kind of element you want and its id.
It depends on awk and html2text.

Syntax: snip <element type>#<element id> <file to parse>
Example: snip div#bodyContent /tmp/index.html
EOF
  exit 0
}

# Print an error message to stderr and exit with a failure status.
die() {
  printf 'snip: %s\n' "$*" >&2
  exit 1
}

# Report that the requested id does not occur in the file, then fail.
quitter() {
  printf '%s\n' "Element id not found. Quitting." >&2
  exit 1
}

#######################################
# Write the source lines of the requested element to stdout.
# Arguments: $1 - element type (already validated as a plain tag name)
#            $2 - element id
#            $3 - path of the HTML file
# Returns:   0 element found and complete (its lines are on stdout)
#            3 no line contains id="<id>"
#            4 the file ended before the element was closed
#            anything else: awk itself failed
#######################################
extract_element() {
  # The tag and id travel through the environment rather than `awk -v` so
  # that awk does not interpret backslash escapes in them. The id is located
  # with index(), i.e. as a fixed string, never as a regular expression.
  SNIP_TAG=$1 SNIP_ID=$2 awk '
    BEGIN {
      needle = "id=\"" tolower(ENVIRON["SNIP_ID"]) "\""
      tag = tolower(ENVIRON["SNIP_TAG"])
      # Require a delimiter after the tag name so that "<a" does not
      # also count "<abbr" or "<article".
      open_re = "<" tag "([ \t\r/>]|$)"
      close_re = "</" tag "([ \t\r>]|$)"
      status = 3
    }
    {
      lower = tolower($0)
      if (!started) {
        if (index(lower, needle) == 0) next
        started = 1
        status = 4
      }
      lines[++count] = $0
      # gsub with "&" leaves the text unchanged and returns the match count.
      depth += gsub(open_re, "&", lower)
      depth -= gsub(close_re, "&", lower)
      if (depth <= 0) {
        status = 0
        exit
      }
    }
    END {
      # Only emit the buffered lines when the element was actually closed.
      if (status == 0) {
        for (i = 1; i <= count; i++) print lines[i]
      }
      exit status
    }
  ' < "$3"
}

# Parse the command line, extract the element and render it with html2text.
main() {
  case "${1:-}" in
    '' | -h | --help) printhelp ;;
  esac

  local spec=$1
  local htmlfile=${2:-}

  [[ $spec == *'#'* ]] \
    || die "expected <element type>#<element id>, got '$spec' (see snip --help)"

  # Everything before the first '#' is the tag name, everything after it is
  # the id.
  local elementtype=${spec%%#*}
  local id=${spec#*#}

  # The tag name ends up inside a regular expression, so restrict it to
  # characters that are valid in a tag name and harmless in a regex.
  [[ $elementtype =~ ^[A-Za-z][A-Za-z0-9-]*$ ]] \
    || die "invalid element type '$elementtype'"
  [[ -n $id ]] || die "empty element id"
  [[ -n $htmlfile ]] || die "no file given (see snip --help)"
  [[ -f $htmlfile && -r $htmlfile ]] || die "cannot read file '$htmlfile'"
  command -v html2text > /dev/null || die "html2text is not installed"

  local html
  local status=0
  html=$(extract_element "$elementtype" "$id" "$htmlfile") || status=$?
  case "$status" in
    0) ;;
    3) quitter ;;
    4) die "element $spec is not closed before the end of '$htmlfile'" ;;
    *) die "failed to scan '$htmlfile' (awk exit status $status)" ;;
  esac

  printf '%s\n' "$html" | html2text -style pretty
}

main "$@"
As an example of how the script can be put to use, here's my Wikipedia lookup script. It assumes the script above is installed somewhere on your PATH under the name 'snip':
#!/bin/bash
#
# Wikipedia lookup: search the English Wikipedia for the terms given on the
# command line and show the text of the resulting page in less.
#
# Usage: <this script> <search terms...>
#
# Requires wget, less and the 'snip' script (on the PATH) to cut the
# div#content element out of the downloaded page.
#
# Exit status: 0 on success, 1 if the page could not be downloaded,
#              2 if no search terms were given.

set -u

readonly useragent="Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071619 Firefox/3.0.1"

# Path of the downloaded page. Kept global so the EXIT trap can still see it
# after main() has returned.
wpfile=''

# Remove the downloaded page, if one was created.
cleanup() {
  [[ -n $wpfile ]] && rm -f -- "$wpfile"
}

#######################################
# Percent-encode a string for use as a URL query value.
# Arguments: $1 - text to encode
# Outputs:   the encoded text on stdout
#######################################
urlencode() {
  # Work byte-wise so multi-byte characters become one %XX per byte.
  local LC_ALL=C
  local text=$1
  local encoded=''
  local ch
  local i
  for ((i = 0; i < ${#text}; i++)); do
    ch=${text:i:1}
    case "$ch" in
      [a-zA-Z0-9.~_-]) encoded+=$ch ;;
      *)
        # A leading quote makes printf emit the numeric value of the byte.
        printf -v ch '%%%02X' "'$ch"
        encoded+=$ch
        ;;
    esac
  done
  printf '%s\n' "$encoded"
}

# Download the search result page and display its content element.
main() {
  if (($# == 0)); then
    printf 'Usage: %s <search terms...>\n' "${0##*/}" >&2
    exit 2
  fi

  # Encode the terms so that characters such as '&', '#', '+' and '%' are
  # searched for literally instead of altering the URL.
  local query
  query=$(urlencode "$*")

  # Use a private temporary file instead of a fixed, predictable /tmp name,
  # and make sure it is removed again on every exit path.
  wpfile=$(mktemp "${TMPDIR:-/tmp}/wpfile.XXXXXX") || {
    printf '%s\n' "Could not create a temporary file." >&2
    exit 1
  }
  trap cleanup EXIT

  if wget -q -U "$useragent" -O "$wpfile" \
    "https://en.wikipedia.org/wiki/Special:Search?search=${query}"; then
    clear
    echo "Page downloaded..."
    snip div#content "$wpfile" | less
  else
    echo "No connection, sorry. Please try again." >&2
    exit 1
  fi
}

main "$@"