Batch download snippets from http://codesnippets.joyent.com and convert them to text files using textutil (see man textutil; available on Mac OS X 10.4 or later).
Note: Old snippet versions will be automatically replaced by the downloaded snippets without a backup!
Author: jv
License:
The MIT License, Copyright (c) 2008 jv
Usage:
bds vim
bds -p 1280
bds -u jvs
bds -t plistbuddy
bds -t tar
bds -t ipfw -u jvs
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Root of the snippet site and the local folder that receives the downloads.
# One trailing slash, if present, is dropped so that path components can be
# appended with a plain "/".
declare BaseURL='http://codesnippets.joyent.com' download_dir="${HOME}/Desktop/Snippets"
BaseURL="${BaseURL%/}"
download_dir="${download_dir%/}"

# URL prefixes for single posts, tag listings and user listings, each
# normalized the same way as BaseURL.
declare BasePostURL BaseTagURL BaseUserURL
BasePostURL="${BaseURL}/posts/show"
BasePostURL="${BasePostURL%/}"
BaseTagURL="${BaseURL}/tag"
BaseTagURL="${BaseTagURL%/}"
BaseUserURL="${BaseURL}/user"
BaseUserURL="${BaseUserURL%/}"

# Encodings passed to textutil: how the downloaded HTML is read and how the
# resulting text file is written.
declare InputEncoding='utf-8' OutputEncoding='utf-8'

# Split words on space, tab and newline; exported to child processes as well.
IFS=$' \t\n'
export IFS
# Download a single post, given by its post number, and convert it to a
# plain-text file (used for: bds -p num).
# cf. snippet, http://codesnippets.joyent.com/posts/show/1282
#
# Globals:
#   BasePostURL, download_dir, InputEncoding, OutputEncoding (all read only)
# Arguments:
#   $1 - post number: a non-empty string consisting of decimal digits only
# Outputs:
#   Status messages on stdout; the converted post is written to
#   ${download_dir}/single-downloads/<num>_<title>.txt
# Returns:
#   0 on success; 1 if the argument is invalid, the target directory cannot
#   be created or entered, or the post page cannot be accessed.
#   Exits the script with status 1 if the download itself fails.
# Side effects:
#   Installs EXIT/signal traps that remove the temporary HTML download.
snippet() {
  local NL OPWD file outputfile postnum target_dir title url

  # An empty argument has no non-digit characters either, so it has to be
  # rejected explicitly.
  if [[ -z "${1}" || "${1//[[:digit:]]/}" != "" ]]; then
    printf '%s\n' "Argument error. No positive integer: ${1}"
    return 1
  fi

  postnum="${1}"
  url="${BasePostURL}/${postnum}"

  # Work on a local copy: the global download_dir stays untouched, so calling
  # this function more than once does not nest "single-downloads" directories.
  target_dir="${download_dir}/single-downloads"
  /bin/mkdir -p "${target_dir}" || return 1
  OPWD="${PWD}"
  cd "${target_dir}" || return 1

  # curl -O stores the page under the last URL component, i.e. the post number.
  file="${target_dir}/${url##*/}"

  # Remove the temporary download on any exit path. The EXIT trap does not
  # call exit itself, so the script's real exit status is preserved; the
  # listed signals still end the script with status 0.
  trap '/bin/rm -f "${file}"' 0
  trap '/bin/rm -f "${file}"; exit 0' 1 2 13 15

  # download snippet web page
  if ! /usr/bin/curl -L -O -s --max-time 25 "${url}"; then
    /bin/rm -f "${file}" # do not leave a partial download behind
    exit 1
  fi

  # get title of downloaded web page
  #title="$(/usr/bin/sed -E -n -e '/<[tT][iI][tT][lL][eE]>/{s/^.*<[tT][iI][tT][lL][eE]>(.*)<\/[tT][iI][tT][lL][eE]>.*$/\1/p;q;}' "${file}" | \
  # /usr/bin/sed -E -e 's/\[[^][:space:]]*\]//g')" # delete [xxx] tag elements of title
  title="$(/usr/bin/egrep -m 1 -io '<title>.*</title>' "${file}" | /usr/bin/sed -E -e 's/^<title>[[:space:]]*|[[:space:]]*<\/title>$//g' \
    -e 's/\[[^][:space:]]*\]//g')" # delete [xxx] tag elements of title

  # Turn the title into a file-name component.
  title="${title//CodeSnippets:/}"
  title="${title//\//:}"               # "/" cannot appear in a file name
  title="${title// /_}"
  title="${title//[[:cntrl:]]/}"
  title="${title%"${title##*[!_]}"}"   # remove trailing underscores

  # An empty title or the site's generic title is treated as "post not accessible".
  if [[ -z "${title}" || "${title}" == '_CodeDrive_Snippets_courtesy_of_Peter_Coopers_handy_little_app' ]]; then
    printf "\e[0K\e[31m%s\e[0m: %s\n" "couldn't access" "${url}"
    /bin/rm -f "${file}"
    cd "${OPWD}" || return 1 # restore the caller's working directory on failure, too
    return 1
  fi

  outputfile="${target_dir}/${postnum}_${title}.txt"
  #outputfile="${target_dir}/${title}.txt" # without post number prefix
  #outputfile="${outputfile//__/_}" # uniq underscores
  printf "\n\e[0K\e[1;30m%s\e[0m: %s\n\n" "saved as" "${outputfile}"
  /usr/bin/textutil -output "${outputfile}" -convert txt -inputencoding "${InputEncoding}" -encoding "${OutputEncoding}" "${file}"
  /bin/rm -f "${file}"

  # escape backslashes
  # man bash 2>/dev/null | less -p 'Each command in a pipeline'
  #outputfile="$(printf "%q" "${outputfile}")" # cf. help printf
  outputfile="${outputfile//\\/\\\\}"

  # NL is a backslash followed by a newline: inside an ed substitution it
  # inserts a line break into the replacement text.
  NL=$'\\\n'

  # Edit the text file in place with ed:
  # delete the lines at the beginning and end, drop 'See related posts',
  # turn the 'to ... by ... on ...' line into Author:/Date:/URL:/Tags: lines
  # and put extra newlines before the comment headings.
  /bin/ed -s "${outputfile}" <<EOF
H
,g/Snippets is a public source code repository/1,/Snippets is a public source code repository/d
,g/You need to create an account or log in to post comments to this site//You need to create an account or log in to post comments to this site/,\$d
,g|(See related posts)$|s|.See related posts.|${NL}${NL}|
,g|^to.* by.* on .*[[:digit:]]$|s|^to\(.*\) by\(.*\) on \(.*[[:digit:]]\)$|${NL}${NL}Author:\2${NL}Date: \3${NL}URL: ${url}${NL}Tags:\1${NL}|
,g|^Comments on this post$|s|\(Comments on this post\)|${NL}\1:|
,g| posts on .* at |s|\(.* posts on .* at .*\)|${NL}\1:|
w
EOF
  # additional ed commands
  # delete line numbers
  # ,g|^[[:space:]]*[[:digit:]]\{1,\}[[:space:]]\{1,3\}|s|^[[:space:]]*[[:digit:]]\{1,\}[[:space:]]\{1,3\}\(.*\)$|\1|
  # delete range of lines
  # 4,11d

  cd "${OPWD}" || return 1
  return 0
}
#----------------------------------------- end of function snippet
# ---------------------------------------------------------------------------
# Command line handling: parse the options and work out which listing URL
# (tagsite) to download and which sub-directory name (dir_name) to use.
# ---------------------------------------------------------------------------
declare pflag tflag uflag
declare cnt count dir_name file no_posts_check NL OPWD outputfile postnum tagsite title url urls website

# Print an error message ($1) followed by the usage line on stderr and exit
# the script with status 1.
usage_error() {
  printf "%s\n%s\n" "${1}" "Usage: ${0##*/} [-p num] [-t tag] [-u user] tag" 1>&2
  exit 1
}

if [[ $# -eq 0 ]]; then
  usage_error 'No arguments given!'
fi

# The leading ":" in the option string selects getopts' silent mode:
# an unknown option is reported as "?", an option that lacks its required
# argument as ":". Both are usage errors; previously the ":" case was
# silently ignored.
while getopts ":p:t:u:" option; do
  case "${option}" in
    p) pflag="${OPTARG}" ;;
    t) tflag="${OPTARG}" ;;
    u) uflag="${OPTARG}" ;;
    :) usage_error 'Argument error!' ;;
    [?]) usage_error 'Argument error!' ;;
    *) ;;
  esac
done
shift $((OPTIND - 1))

# Precedence: a single positional tag wins over all options; otherwise
# -p (single post), then -t together with -u, then -t alone, then -u alone.
if [[ $# -eq 1 ]]; then
  dir_name="${1}"
  tagsite="${BaseTagURL}/${1}"
elif [[ $# -gt 1 ]]; then
  usage_error 'Too many arguments!'
elif [[ -n "${pflag}" ]]; then
  # Single-post mode ends here; hand snippet's status on to the caller
  # instead of always reporting success.
  snippet "${pflag}"
  exit $?
elif [[ -n "${tflag}" ]] && [[ -n "${uflag}" ]]; then
  dir_name="${tflag}-${uflag}"
  tagsite="${BaseUserURL}/${uflag}/tag/${tflag}"
elif [[ -n "${tflag}" ]]; then
  dir_name="${tflag}"
  tagsite="${BaseTagURL}/${tflag}"
elif [[ -n "${uflag}" ]]; then
  dir_name="${uflag}"
  tagsite="${BaseUserURL}/${uflag}"
else
  usage_error 'Argument error!'
fi

tagsite="${tagsite%/}" # drop a trailing slash so the page number can be appended
#echo $dir_name
#echo $tagsite
# ---------------------------------------------------------------------------
# Paged download: fetch the listing pages ${tagsite}/1, ${tagsite}/2, ...,
# download every post linked from them and convert each one to a text file
# inside ${download_dir}/<dir_name>.
# Reads tagsite and dir_name (set by the argument handling) as well as
# BaseURL, download_dir, InputEncoding and OutputEncoding.
# ---------------------------------------------------------------------------
count=1          # number of the listing page to fetch next
cnt=0            # running number of saved snippets
curl_max_time=20 # seconds allowed per curl transfer
website=''
no_posts_check=''
file=''
NL=$'\\\n'       # backslash + newline: inserts a line break in an ed replacement

download_dir="${download_dir}/${dir_name//\//:}"
download_dir="${download_dir%/}"
# curl -O writes into the current directory, so stop if the target directory
# cannot be created or entered.
/bin/mkdir -p "${download_dir}" || exit 1
OPWD="${PWD}"
cd "${download_dir}" || exit 1

# Remove the temporary HTML download on any exit path. ${file} is expanded
# when the trap fires, so installing the traps once is enough. The EXIT trap
# does not call exit itself, which keeps the script's real exit status; the
# listed signals still end the script with status 0.
trap '/bin/rm -f "${file}"' 0
trap '/bin/rm -f "${file}"; exit 0' 1 2 13 15

# print download directory
printf "\n\e[0K\e[1;30m%s\e[0m: %s\n\n" "download directory" "${download_dir}"

while [[ -z "${no_posts_check}" ]]; do
  # download website of the form:
  # http://somewebsite.com/tag/bash/1,
  # http://somewebsite.com/user/name/1 or
  # http://somewebsite.com/user/name/tag/bash/1
  if ! website="$(/usr/bin/curl -L -s --max-time "${curl_max_time}" "${tagsite}/${count}")"; then
    printf "\e[0K\e[31m%s\e[0m: %s\n" "curl_max_time ${curl_max_time}" "${tagsite}/${count}"
    exit 1
  fi

  #if [[ -n "$(printf "%s" "${website}" | /usr/bin/egrep -o 'Application error \(Apache\)')" ]]; then
  #no_posts_check='Application error (Apache)'
  #printf "\e[0K\e[31m%s\e[0m: %s\n" "no further posts" "${no_posts_check}"
  #fi

  # A listing page containing '>No posts<' marks the end of the listing.
  if [[ "${website}" == *'>No posts<'* ]]; then
    no_posts_check='>No posts<'
    #printf "\e[0K\e[31m%s\e[0m: %s\n" "no further posts" "${no_posts_check}"
    break
  fi

  # extract relevant post URLs
  #urls=( $(printf "%s\n" "${website}" | /usr/bin/sed -E -n -e "s|^.* href=\"(/posts/show/[[:digit:]]+)\".*$|${BaseURL}\1|p;g") )
  # Word splitting of the command output is intended here: one URL per word.
  # shellcheck disable=SC2207
  urls=( $(printf "%s\n" "${website}" | /usr/bin/egrep -o 'href="/posts/show/[[:digit:]]+"' | /usr/bin/sed -E -n -e "s|href=\"(/posts/show/[[:digit:]]+)\"|${BaseURL}\1|p;g") )

  # A page with neither the end marker nor any post link (e.g. an error page)
  # would otherwise keep this loop requesting further pages forever.
  if [[ ${#urls[@]} -eq 0 ]]; then
    printf "\e[0K\e[31m%s\e[0m: %s\n" "no post links found" "${tagsite}/${count}" 1>&2
    break
  fi

  for url in "${urls[@]}"; do
    postnum="${url##*/}"
    # curl -O stores the page under the last URL component, i.e. the post number.
    file="${download_dir}/${postnum}"

    if ! /usr/bin/curl -L -O -s --max-time "${curl_max_time}" "${url}"; then
      printf "\e[0K\e[31m%s\e[0m: %s\n" "curl_max_time ${curl_max_time}" "${url}"
      /bin/rm -f "${file}" # do not leave a partial download behind
      continue
    fi

    # get title of downloaded web page
    #title="$(/usr/bin/sed -E -n -e '/<[tT][iI][tT][lL][eE]>/{s/^.*<[tT][iI][tT][lL][eE]>(.*)<\/[tT][iI][tT][lL][eE]>.*$/\1/p;q;}' "${file}" | \
    # /usr/bin/sed -E -e 's/\[[^][:space:]]*\]//g')" # delete [xxx] tag elements of title
    title="$(/usr/bin/egrep -m 1 -io '<title>.*</title>' "${file}" | /usr/bin/sed -E -e 's/^<title>[[:space:]]*|[[:space:]]*<\/title>$//g' \
      -e 's/\[[^][:space:]]*\]//g')" # delete [xxx] tag elements of title

    # Turn the title into a file-name component.
    title="${title//CodeSnippets:/}"
    title="${title//\//:}"               # "/" cannot appear in a file name
    title="${title// /_}"
    title="${title//[[:cntrl:]]/}"
    title="${title%"${title##*[!_]}"}"   # remove trailing underscores
    #printf "%s\n" "${title}"

    # An empty title or the site's generic title is treated as "post not accessible".
    if [[ -z "${title}" || "${title}" == '_CodeDrive_Snippets_courtesy_of_Peter_Coopers_handy_little_app' ]]; then
      printf "\e[0K\e[31m%s\e[0m: %s\n" "couldn't access" "${url}"
      /bin/rm -f "${file}"
      continue
    fi

    outputfile="${download_dir}/${postnum}_${title}.txt"
    #outputfile="${download_dir}/${title}.txt" # without post number prefix
    #outputfile="${outputfile//__/_}" # uniq underscores

    cnt=$((cnt + 1))
    printf "\e[0K\e[1;32m%-6s\e[0m %s\n" "${cnt}" "${outputfile##*/}"
    /usr/bin/textutil -output "${outputfile}" -convert txt -inputencoding "${InputEncoding}" -encoding "${OutputEncoding}" "${file}"
    /bin/rm -f "${file}"

    # escape backslashes
    # man bash 2>/dev/null | less -p 'Each command in a pipeline'
    #outputfile="$(printf "%q" "${outputfile}")" # cf. help printf
    outputfile="${outputfile//\\/\\\\}"

    # edit $outputfile in-place with ed (see man ed):
    # first delete lines at the beginning & end,
    # then remove the string 'See related posts' and add some newlines with $NL,
    # then convert the line 'to...by...on' to line 'Author:...', line 'Date:...', line 'URL:...' and line 'Tags:...'
    # and finally the last two ed commands insert two further newlines with $NL
    /bin/ed -s "${outputfile}" <<EOF
H
,g/Snippets is a public source code repository/1,/Snippets is a public source code repository/d
,g/You need to create an account or log in to post comments to this site//You need to create an account or log in to post comments to this site/,\$d
,g|(See related posts)$|s|.See related posts.|${NL}${NL}|
,g|^to.* by.* on .*[[:digit:]]$|s|^to\(.*\) by\(.*\) on \(.*[[:digit:]]\)$|${NL}${NL}Author:\2${NL}Date: \3${NL}URL: ${url}${NL}Tags:\1${NL}|
,g|^Comments on this post$|s|\(Comments on this post\)|${NL}\1:|
,g| posts on .* at |s|\(.* posts on .* at .*\)|${NL}\1:|
w
EOF
    # additional ed commands
    # delete line numbers
    # ,g|^[[:space:]]*[[:digit:]]\{1,\}[[:space:]]\{1,3\}|s|^[[:space:]]*[[:digit:]]\{1,\}[[:space:]]\{1,3\}\(.*\)$|\1|
    # delete range of lines
    # 4,11d
  done # for

  count=$((count + 1))
done # while

cd "${OPWD}"
exit 0