EDIT: Added a quick check for the presence/absence of the element type in each line (before the grep operations). This greatly increases speed with large elements such as #content on Wikipedia.
#!/bin/bash
#
# snip - cut one element out of an HTML file and render it with html2text.
#
# Usage:   snip <element type>#<element id> <file>
# Example: snip div#bodyContent /tmp/index.html
#
# The file is read line by line. Output starts at the first line carrying an
# id="<element id>" attribute (matched case-insensitively) and ends with the
# line on which the element's nesting depth returns to zero. Extraction works
# on whole lines, so the HTML is expected to be well formed.
#
# Exit status: 0 on success (status of html2text), 1 on runtime errors
# (unreadable file, id not found, element never closed, html2text missing),
# 2 on a malformed selector.
#
# Requires: bash 3.2 or newer, html2text.

# Print the usage text to stdout.
printhelp() {
  cat <<'EOF'
snip is a simple bash html cutter that works by extracting a specific element
from an html file and feeding it to html2text. It presupposes wellformed html
and that you know the kind of element you want and it's id.

Syntax: snip <element type>#<element id> <file to parsed>

Example: snip div#bodyContent /tmp/index.html
EOF
}

# Print a diagnostic to stderr.
err() {
  printf '%s\n' "$*" >&2
}

#######################################
# Print the lines of an HTML file that make up one element.
# The first printed line is rewritten so that it starts at
# `<type id="id"`; everything before the id attribute on that line is dropped.
# Arguments:
#   $1 - element type (already validated: letters, digits, ':', '_', '-')
#   $2 - element id
#   $3 - path of the HTML file
# Outputs:
#   The element's lines on stdout (nothing unless the element is closed).
# Returns:
#   0 on success, 1 if the id was not found, 2 if the element never closes.
#######################################
extract_element() (
  # The function body is a subshell so that nocasematch cannot leak out.
  shopt -s nocasematch

  local elementtype=$1 id=$2 htmlfile=$3
  local needle="id=\"${id}\""
  # Optional prefix that must not end in an attribute-name character, so that
  # e.g. data-id="x" is not mistaken for id="x". Greedy, like the old sed.
  local before_re='^(.*[^[:alnum:]_:.-])?'
  local rest_re='(.*)$'
  # Glob patterns for the text that follows a '<': the tag name must be
  # followed by whitespace, '/' or '>' so that <br> is not counted as <b>.
  local open_pat="${elementtype}[[:space:]/>]*"
  local close_pat="/${elementtype}[[:space:]>]*"
  local -a snippet=() parts=()
  local line part i
  local depth=0 started=0

  # '|| [[ -n ... ]]' keeps a final line that has no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    if (( ! started )); then
      # Cheap literal test first; the regex only runs on candidate lines.
      [[ "$line" == *"$needle"* ]] || continue
      # The quoted part of the regex is matched literally, so the id cannot
      # inject regex syntax.
      [[ "$line" =~ ${before_re}"$needle"${rest_re} ]] || continue
      line="<${elementtype} ${needle}${BASH_REMATCH[2]}"
      started=1
    fi

    snippet+=("$line")

    # Quick skip for lines that cannot contain a relevant tag.
    [[ "$line" == *"$elementtype"* ]] || continue

    # Split on '<': every field after the first begins right after a '<'.
    IFS='<' read -r -a parts <<<"$line"
    for (( i = 1; i < ${#parts[@]}; i++ )); do
      part=${parts[i]}
      if [[ "$part" == $open_pat || "$part" == "$elementtype" ]]; then
        depth=$(( depth + 1 ))
      elif [[ "$part" == $close_pat || "$part" == "/$elementtype" ]]; then
        depth=$(( depth - 1 ))
        if (( depth <= 0 )); then
          printf '%s\n' "${snippet[@]}"
          exit 0
        fi
      fi
    done
  done <"$htmlfile"

  (( started )) || exit 1
  exit 2
)

#######################################
# Entry point: parse the arguments, extract the element, render it.
# Arguments:
#   $1 - selector of the form <element type>#<element id>, or -h / --help
#   $2 - path of the HTML file
# Outputs:
#   Rendered text on stdout; diagnostics on stderr.
# Returns:
#   See "Exit status" in the file header.
#######################################
main() {
  if [[ $# -eq 0 || -z "$1" || "$1" == "-h" || "$1" == "--help" ]]; then
    printhelp
    return 0
  fi

  local selector=$1
  local htmlfile=${2:-}
  local name_re='^[[:alnum:]][[:alnum:]:_-]*$'
  local elementtype id snippet status

  if [[ "$selector" != *'#'* ]]; then
    err "Syntax: snip <element type>#<element id> <file>"
    return 2
  fi
  elementtype=${selector%%#*}
  # Like the former 'cut -d# -f2': only the text between the first and the
  # second '#' is the id.
  id=${selector#*#}
  id=${id%%#*}
  if [[ -z "$id" || ! "$elementtype" =~ $name_re ]]; then
    err "Syntax: snip <element type>#<element id> <file>"
    return 2
  fi

  if [[ -z "$htmlfile" || ! -r "$htmlfile" || -d "$htmlfile" ]]; then
    err "Cannot read file: ${htmlfile}"
    return 1
  fi
  if ! command -v html2text >/dev/null 2>&1; then
    err "html2text not found. Quitting."
    return 1
  fi

  snippet=$(extract_element "$elementtype" "$id" "$htmlfile")
  status=$?
  case "$status" in
    0)
      printf '%s\n' "$snippet" | html2text
      ;;
    1)
      err "Element id not found. Quitting."
      return 1
      ;;
    *)
      err "Closing tag of <${elementtype} id=\"${id}\"> not found. Quitting."
      return 1
      ;;
  esac
}

main "$@"
As an example of how the script can be put to use, here's my Wikipedia lookup (the script above is referred to as 'snip' here):
#!/bin/bash
#
# Wikipedia lookup: search the English Wikipedia for the given terms, cut the
# div#content element out of the result page with 'snip' and page it with less.
#
# Usage: <this script> <search terms...>
#
# Requires: wget, snip (must be on PATH), less, mktemp.

# Path of the downloaded page; global so the EXIT trap can still see it.
wp_tmpfile=

# Remove the downloaded page, if one was created.
cleanup() {
  if [[ -n "${wp_tmpfile:-}" ]]; then
    rm -f -- "$wp_tmpfile"
  fi
}

#######################################
# Percent-encode a string for use as a URL query value.
# Unreserved characters (letters, digits, '-', '_', '.', '~') are kept,
# every other byte becomes %XX.
# Arguments: $1 - string to encode
# Outputs:   encoded string on stdout (no trailing newline)
#######################################
urlencode() {
  # C locale: iterate over bytes, so multi-byte characters are encoded
  # byte by byte and the ranges below cover ASCII only.
  local LC_ALL=C
  local input=$1 encoded='' char i

  for (( i = 0; i < ${#input}; i++ )); do
    char=${input:i:1}
    case "$char" in
      [a-zA-Z0-9.~_-])
        encoded+=$char
        ;;
      *)
        # "'x" makes printf emit the numeric value of character x.
        printf -v char '%%%02X' "'$char"
        encoded+=$char
        ;;
    esac
  done
  printf '%s' "$encoded"
}

#######################################
# Download the search result page and show its content element.
# Arguments: search terms; all arguments are joined with single spaces
# Outputs:   progress message and paged text on stdout, errors on stderr
# Returns:   0 on success, 1 if the download fails, 2 on missing arguments
#######################################
main() {
  if (( $# == 0 )); then
    printf 'Usage: %s <search terms>\n' "${0##*/}" >&2
    return 2
  fi

  local useragent="Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071619 Firefox/3.0.1"
  local query url

  # Encode the terms so that characters such as '&', '#' or '+' stay part of
  # the search string instead of altering the URL.
  query=$(urlencode "$*")
  url="https://en.wikipedia.org/wiki/Special:Search?search=${query}"

  # Private temp file instead of a predictable name in /tmp.
  wp_tmpfile=$(mktemp) || {
    echo "Cannot create temporary file." >&2
    return 1
  }
  trap cleanup EXIT

  if wget -q -U "$useragent" -O "$wp_tmpfile" "$url"; then
    clear
    echo "Page downloaded..."
    snip 'div#content' "$wp_tmpfile" | less
  else
    echo "No connection, sorry. Please try again." >&2
    return 1
  fi
}

main "$@"