#!/usr/bin/ksh93

########################################################################
#                                                                      #
#               This software is part of the ast package               #
#                 Copyright (c) 2007-2013 Roland Mainz                 #
#                      and is licensed under the                       #
#                 Eclipse Public License, Version 1.0                  #
#                    by AT&T Intellectual Property                     #
#                                                                      #
#                A copy of the License is available at                 #
#          http://www.eclipse.org/org/documents/epl-v10.html           #
#         (with md5 checksum b35adb5213ca9657e911e9befb180842)         #
#                                                                      #
#                                                                      #
#                 Roland Mainz <roland.mainz@nrubsig.org>              #
#                                                                      #
########################################################################

#
# Copyright (c) 2007, 2013, Roland Mainz. All rights reserved.
#

#
# Written by Roland Mainz <roland.mainz@nrubsig.org>
#

#
# rssread - a simple RSS2.0 reader with RSS to XHTML to
# plaintext conversion.
#

# Solaris needs /usr/xpg6/bin:/usr/xpg4/bin because the tools in /usr/bin are not POSIX-conformant
# (pinning PATH also shields the script from untrusted user PATH settings)
export PATH='/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin'

# printmsg <printf-format> [ arg... ]
# print a formatted progress/diagnostic message to stderr; always returns 0
function printmsg
{
	print -u2 -f "$@"
	return 0
}

# debugmsg <printf-format> [ arg... ]
# debug output hook; disabled by default. Re-enable by uncommenting
# the forwarding call to printmsg below.
function debugmsg
{
#	printmsg "$@"
	: # deliberate no-op; always succeeds
}

# fatal_error <printf-format> [ arg... ]
# print an error message prefixed with the program name to stderr and
# terminate the whole script with exit code 1
function fatal_error
{
	print -u2 -n "${progname}: "
	print -u2 -f "$@"
	exit 1
}

# urlconnection_t - a simple URL connection "class" implemented as a
# ksh93 type (typeset -T).  Supports file://, http:// and https://
# URLs; https is handled by an asynchronous "openssl s_client" child
# process connected through a pair of fifos.  Inside the method
# ("discipline") functions the variable "_" refers to the instance.
typeset -T urlconnection_t=(
	# public
	typeset user_agent='ksh93/urlconnection_t'

	# private variables
	typeset protocol	# URL scheme, e.g. 'http', 'https' or 'file'
	typeset path1		# URL with the "<protocol>://" prefix stripped
	typeset host		# remote host (for http/https)
	typeset path		# path component of the URL (for http/https)
	typeset port		# TCP port; equals host until cat_url resolves it (see init_url)
	
	compound netfd=(
		integer in=-1  # incoming traffic
		integer out=-1 # outgoing traffic
	)

	# only used for https
	compound ssl=(
		compound fifo=(
			typeset dir=''	# temporary directory holding the two fifos
			typeset in=''	# fifo: data we send to openssl
			typeset out=''	# fifo: data openssl received from the server
		)
		integer openssl_client_pid=-1
	)

	# parse HTTP return code, cookies etc.
	# Reads the HTTP response header from stdin and fills the compound
	# variable named by $1 with .statuscode, .statusmsg, .headers[] and
	# - on demand - .content_length, .content_type, .location and
	# .transfer_encoding fields.
	function parse_http_response
	{
		nameref response="$1"
		typeset h statuscode statusmsg s
		integer i

		# we use '\r' as additional IFS to filter the final '\r'
		IFS=$' \t\r' read -r h statuscode statusmsg  # read HTTP/1.[01] <code>
		[[ "${h}" != ~(Eil)HTTP/ ]]           && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; }
		[[ "${statuscode}" != ~(Elr)[0-9]+ ]] && { print -u2 -f $"%s: invalid status code\n"  "$0" ; return 1 ; }

		# force base-10 interpretation (a leading zero would otherwise be octal)
		integer response.statuscode="10#${statuscode}"
		typeset response.statusmsg="${statusmsg}"

	    	typeset -a response.headers
		
		# collect headers
		while IFS='' read -r s ; do
			# a bare '\r' (empty line) separates headers from the body
			[[ "${s}" == $'\r' ]] && break
	
			# strip '\r' at the end
			s="${s/~(Er)$'\r'/}"
			
			response.headers+=( "${s}" )
		done
	
		for (( i=0 ; i < ${#response.headers[@]} ; i++ )) ; do
			s="${response.headers[i]}"
			# add compound variable fields _ONLY_ on _demand_ if the
			# matching headers exist
			case "${s}" in
				~(Eli)Content-Length:[[:blank:]]+[0-9]+)
					integer response.content_length="10#${s/~(Eli)Content-Length:[[:blank:]]+/}"
					;;
				~(Eli)Content-Type:[[:blank:]]+)
					typeset response.content_type="${s/~(Eli)Content-Type:[[:blank:]]+/}"
					;;
				~(Eli)Location:[[:blank:]]+)
					typeset response.location="${s/~(Eli)Location:[[:blank:]]+/}"
					;;
				~(Eli)Transfer-Encoding:[[:blank:]]+)
					typeset response.transfer_encoding="${s/~(Eli)Transfer-Encoding:[[:blank:]]+/}"
					;;
			esac
		done
		
		return 0
	}

	# copy the HTTP body from stdin to stdout; if $1 is 'chunked' the
	# "chunked" transfer encoding is decoded, any other value passes
	# the data through unmodified
	function cat_http_body
	{
		typeset emode="$1"
		typeset hexchunksize='0'
		integer chunksize=0

		if [[ "${emode}" == 'chunked' ]] ; then
			# each chunk is preceded by its size as a hexadecimal
			# number on a line of its own; a size of 0 ends the body
			while IFS=$'\n' read hexchunksize ; do
				hexchunksize="${hexchunksize//$'\r'/}"
				[[ "${hexchunksize}" != '' ]] || continue
				[[ "${hexchunksize}" == ~(Elr)[[:xdigit:]]+ ]] || break
				chunksize="16#${hexchunksize}"
				(( chunksize > 0 )) || break
				dd bs=1 count="${chunksize}" 2>'/dev/null'
			done
		else
			cat
		fi

		return 0
	}
	
	# split the URL in $1 into the protocol/host/path/port fields
	function init_url
	{
		_.protocol="${1%://*}"
		_.path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"

		if  [[ "${_.protocol}" == ~(Elr)http(|s) ]] ; then
			_.host="${_.path1%%/*}"
			_.path="${_.path1#*/}"
			# NOTE: if the URL carries no explicit port this leaves
			# _.port == _.host; cat_url detects that case later and
			# derives the port from the protocol
			_.port="${_.host##*:}"
		fi
		
		return 0
	}
	
	# close connection
	function close_connection
	{
		integer ret

		if (( _.netfd.in != -1 )) ; then
			redirect {_.netfd.in}<&-
			(( _.netfd.in=-1 ))
		fi
		
		# for plain http "in" and "out" share one fd - close it only once
		if (( _.netfd.in != _.netfd.out && _.netfd.out != -1 )) ; then
			redirect {_.netfd.out}<&-
			((  _.netfd.out=-1 ))
		fi

		if [[ "${_.protocol}" == 'https' ]] ; then
			# reap the async openssl child and remove the fifo directory
			wait ${_.ssl.openssl_client_pid} || { print -u2 -f $"%s: openssl failed.\n" "${0}" ; return 1 ; }
			(( _.ssl.openssl_client_pid=-1 ))

			rm -R "${_.ssl.fifo.dir}"
			_.ssl.fifo.dir=''
		fi

		return 0
	}
	
	# open the transport connection: a /dev/tcp/ fd for http, or an
	# asynchronous "openssl s_client" child talking through fifos for https
	function open_connection
	{
		if [[ "${_.protocol}" == 'https' ]] ; then
			_.ssl.fifo.dir="$(mktemp -t -d)"
			_.ssl.fifo.in="${_.ssl.fifo.dir}/in"
			_.ssl.fifo.out="${_.ssl.fifo.dir}/out"

			# Use "errexit" to leave it at the first error
			# (this saves lots of if/fi tests for error checking)
			set -o errexit

			mkfifo "${_.ssl.fifo.in}" "${_.ssl.fifo.out}"

			# create async openssl child to handle https
			openssl s_client -quiet -connect "${_.host}:${_.port}" <"${_.ssl.fifo.in}" >>"${_.ssl.fifo.out}" &
			
			(( _.ssl.openssl_client_pid=$! ))
		else
			redirect {_.netfd.in}<> "/dev/tcp/${_.host}/${_.port}"
			(( $? != 0 )) && { print -u2 -f $"%s: Could not open %q\n" "${0}" "${_.host}/${_.port}" ; return 1 ; }
			(( _.netfd.out=_.netfd.in ))
		fi
		return 0
	}

	# write the request in $1 (plus a trailing '\r\n') to the peer;
	# for https this also opens the read side of the openssl output fifo
	function send_request
	{
		typeset request="$1"
		
		set -o errexit
		
		if [[ "${_.protocol}" == 'https' ]] ; then
				print -n -- "${request}\r\n" >>	"${_.ssl.fifo.in}"

				redirect {_.netfd.in}< "${_.ssl.fifo.out}"
		else
				print -n -- "${request}\r\n" >&${_.netfd.out}
		fi
		return 0
	}
	
	# fetch the document at the URL set via init_url and write it to
	# stdout; returns non-zero on connection or HTTP errors
	function cat_url
	{
		integer res

		if [[ "${_.protocol}" == 'file' ]] ; then
			cat "${_.path1}"
			return $?
		elif [[ "${_.protocol}" == ~(Elr)http(|s) ]] ; then
			compound httpresponse # http response

			# If URL did not contain a port number in the host part then look at the
			# protocol to get the port number
			if [[ "${_.port}" == "${_.host}" ]] ; then
				case "${_.protocol}" in
					'http')  _.port=80 ;;
					'https') _.port=443 ;;
					*)       _.port="$(getent services "${_.protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
				esac
			else
				_.host="${_.host%:*}"
			fi

			printmsg "protocol=%q port=%q host=%q path=%q\n" "${_.protocol}" "${_.port}" "${_.host}" "${_.path}"

			# prechecks
			[[ "${_.protocol}" != '' ]] || { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
			[[ "${_.port}"     != '' ]] || { print -u2 -f "%s: port not set.\n"     "$0" ; return 1 ; }
			[[ "${_.host}"     != '' ]] || { print -u2 -f "%s: host not set.\n"     "$0" ; return 1 ; }
			[[ "${_.path}"     != '' ]] || { print -u2 -f "%s: path not set.\n"     "$0" ; return 1 ; }

			_.open_connection || return 1

			# send HTTP request    
			request="GET /${_.path} HTTP/1.1\r\n"
			request+="Host: ${_.host}\r\n"
			request+="User-Agent: ${_.user_agent}\r\n"
			request+='Connection: close\r\n'
			_.send_request "${request}\r\n"

			# collect response and send it to stdout
			{
				_.parse_http_response httpresponse ; (( res=$? ))
				if (( res == 0 )) ; then
					_.cat_http_body "${httpresponse.transfer_encoding-}" ; (( res+=$? ))
				fi

				# only 2xx counts as success; redirects are reported
				# on stderr but not followed
				if [[ -v httpresponse.statuscode ]] ; then
					if ! (( httpresponse.statuscode >= 200 && httpresponse.statuscode <= 299 )) ; then
						case "${httpresponse.statuscode}" in
							'301'|'302'|'303')
								print -u2 -f $"# HTTP redirect (%s) to %q\n" "${httpresponse.statuscode}" "${httpresponse.location}"
								;;
							*)
								print -u2 -f $"Unexpected http status code %d\n" httpresponse.statuscode
								;;
						esac
						(( res++ ))
					fi
				else
					(( res++ ))
				fi
			} <&${_.netfd.in}
			
			_.close_connection
			
			return ${res}
		else
			return 1
		fi
		# notreached
	}
)

# html_entity_to_ascii - line-wise stdin/stdout filter which decodes
# HTML/XML entities via html_entity_to_ascii_string
function html_entity_to_ascii
{
	typeset line=''
	typeset converted=''

	# decode each input line and emit the decoded text
	while read -r line ; do
		html_entity_to_ascii_string converted "${line}"
		printf '%s\n' "${converted}"
	done

	return 0
}


# html_entity_to_ascii_string <outvarname> <string>
# decode HTML/XML entities in <string> (named entities from the cache
# below, decimal "&#NNN;" and hexadecimal literals) and store the
# result in the variable named by <outvarname>.
function html_entity_to_ascii_string
{
	nameref outbuf=$1
	typeset inbuf="$2"
	integer inbuf_index=0

	typeset entity	# entity name collected between '&' and ';'
	typeset c	# current input character
	typeset value	# decoded replacement text

	outbuf=''
	
	# Todo: Add more HTML/MathML entities here
	# Note we use a static variable (typeset -S) here to make sure we
	# don't loose the cache data between calls
	typeset -S -A entity_cache=(
		# entity to ascii (fixme: add UTF-8 transliterations)
		["nbsp"]=' '
		["lt"]='<'
		["le"]='<='
		["gt"]='>'
		["ge"]='>='
		["amp"]='&'
		["quot"]='"'
		["apos"]="'"
	)
    
	# scan the input one character at a time
	while c="${inbuf:inbuf_index++:1}" ; [[ "${c}" != '' ]] ; do
		if [[ "${c}" != '&' ]] ; then
			# plain character - copy through
			outbuf+="${c}"
			continue
		fi
        
		# collect the entity name up to the terminating ';'
		entity=''
		while c="${inbuf:inbuf_index++:1}" ; [[ "${c}" != '' ]] ; do
			case "${c}" in
				';')
				break
				;;
			~(Eilr)[a-z0-9#])
				entity+="${c}"
				continue
				;;
			*)
#				debugmsg "error &${entity}${c}#"

				# not a well-formed entity - emit the text verbatim
				outbuf+="${entity}${c}"
				entity=''
				continue 2
				;;
			esac
		done
        
		value=''
		if [[ -v entity_cache["${entity}"] ]] ; then
#			debugmsg "match #${entity}# = #${entity_cache["${entity}"]}#"
			value="${entity_cache["${entity}"]}"
		else
			if [[ "${entity:0:1}" == '#' ]] ; then
				# decimal literal
				value="${ printf "\u[${ printf "%x" "${entity:1:8}" ; }]" ; }"
			elif [[ "${entity:0:7}" == ~(Eilr)[0-9a-f]+ ]] ; then
				# hexadecimal literal
				value="${ printf "\u[${entity:0:7}]" ; }"
			else
				# unknown literal - pass-through
				value="ENT=|${entity}|"
			fi

			# remember the decoded value for subsequent lookups
			entity_cache["${entity}"]="${value}"

#			debugmsg "lookup #${entity}# = #${entity_cache["${entity}"]}#"
		fi

		outbuf+="${value}"
	done

	return 0
}

# dumb xhtml handler - no CSS,  tables, images, iframes or nested
# structures are supported (and we assume that the input is correct
# xhtml). The code was written in a trial&&error manner and should be
# rewritten to parse xhtml correctly.
function handle_html
{
    # per-callback state lives in the callbacks associative array because
    # several callback slots may share this function (so no globals here)
    nameref callbacks=${1}
    typeset kind="$2"    # event type reported by xml_tok
    typeset value="$3"   # tag name resp. text content

    case "${kind}" in
        document_start)
            # start outside of any <pre> element
            callbacks["html_pre"]='false'
            ;;

        document_end)
            ;;

        tag_begin)
            case "${value}" in
                br | p) printf '\n' ;;
                hr)     printf '\n-------------------------------------\n' ;;
                pre)    callbacks["html_pre"]='true' ;;
            esac
            ;;

        tag_end)
            [[ "${value}" == 'pre' ]] && callbacks["html_pre"]='false'
            ;;

        tag_text)
            if ${callbacks["html_pre"]} ; then
                # inside <pre>: keep whitespace verbatim
                printf '%s' "${value}"
            else
                # compress spaces/newlines/tabs/etc.
                printf '%s' "${value//+([[:space:]])/ }"
            fi
            ;;
    esac

    return 0
}

# handle_rss - xml_tok callback which collects the fields of an RSS
# <item> in the global associative array "item" and, on </item>, renders
# the item through an RSS -> HTML -> plain text conversion pipeline.
function handle_rss
{
	# we can't use global variables here when multiple callbacks use the same
	# callback function - but we can use the callback associative array for
	# variable storage instead
	nameref callbacks=${1}
	typeset tag_type="$2"
	typeset tag_value="$3"

	case "${tag_type}" in
		tag_begin)
			case "${tag_value}" in
				item)
					# new RSS item: discard data of the previous one
					unset \
						item["title"] \
						item["link"] \
						item["tag"] \
						item["description"]
					;;
			esac
			callbacks["textbuf"]=''
			;;
		tag_end)
			case "${tag_value}" in
				item)
					# note that each RSS item needs to be converted seperately from RSS to HTML to plain text
					# to make sure that the state of one RSS item doesn't affect others
					(
						printf $"<br />#### RSS item: title: %s ####" "${item["title"]}"
						[[ -v item["author"]	]] && printf $"<br />## author: %s" "${item["author"]}"
						[[ -v item["link"]	]] && printf $"<br />## link:   %s" "${item["link"]}"
						[[ -v item["pubDate"]	]] && printf $"<br />## date:   %s" "${item["pubDate"]}"
						if [[ -v item["description"] ]] ; then
							printf $"<br />## begin description:"
							printf $"<br />%s<br />" "${item["description"]}"
							printf $"<br />## end description<br />"
						fi
						print # extra newline to make sure the sed pipeline gets flushed
					) | 
						html_entity_to_ascii |	# convert XML entities (e.g. decode RSS content to HTML code)
						xml_tok "xhtmltok_cb" |	# convert HTML to plain text
						html_entity_to_ascii	# convert HTML entities
					;;
				# remember the accumulated element text in the matching item[] field
				title)                item["title"]="${callbacks["textbuf"]}"        ; callbacks["textbuf"]='' ;;
				link)                 item["link"]="${callbacks["textbuf"]}"         ; callbacks["textbuf"]='' ;;
				dc:creator | author)  item["author"]="${callbacks["textbuf"]}"       ; callbacks["textbuf"]='' ;;
				dc:date | pubDate)    item["pubDate"]="${callbacks["textbuf"]}"      ; callbacks["textbuf"]='' ;;
				description)          item["description"]="${callbacks["textbuf"]}"  ; callbacks["textbuf"]='' ;;
			esac
			callbacks["textbuf"]=''
			;;
		tag_text)
			# accumulate text content until the enclosing tag ends
			callbacks["textbuf"]+="${tag_value}"
			;;
		document_start)
			;;
		document_end)
			;;
	esac

	return 0
}

# xml_tok - minimalistic XML/HTML tokenizer.  Reads markup from stdin
# and invokes the callback functions registered in the associative
# array named by $1 (recognized keys: "document_start", "document_end",
# "tag_begin", "tag_end", "tag_text", "tag_comment"); each callback is
# invoked with the array name, the event type and the event payload.
function xml_tok
{
    typeset buf=''	# accumulates text content resp. the current tag
    typeset namebuf=''	# tag name
    typeset attrbuf=''	# raw attribute string (everything after the name)
    typeset c=''
    bool isendtag	# bool: true/false
    bool issingletag	# bool: true/false (used for tags like "<br />")
    nameref callbacks=${1}
    
    [[ -v callbacks["document_start"] ]] && ${callbacks["document_start"]} "${1}" "document_start"

    while IFS='' read -r -N 1 c ; do
        isendtag=false
        
        if [[ "$c" == "<" ]] ; then
	    # flush any text content
            if [[ "$buf" != '' ]] ; then
                [[ -v callbacks["tag_text"] ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
                buf=''
            fi
            
            # determine whether this is an end tag ("</...>")
            IFS='' read -r -N 1 c
            if [[ "$c" == '/' ]] ; then
                isendtag=true
            else
                buf="$c"
            fi
            # slurp the rest of the tag up to (but not including) '>'
            IFS='' read -r -d '>' c
            buf+="$c"
	    
	    # handle comments
	    if [[ "$buf" == ~(El)!-- ]] ; then
	        # did we read the comment completely ?
	        if [[ "$buf" != ~(Elr)!--.*-- ]] ; then
		    buf+=">"
	            while [[ "$buf" != ~(Elr)!--.*-- ]] ; do
		        IFS='' read -r -N 1 c || break
		        buf+="$c"
		    done
		fi
	    
		# strip leading "!--" and trailing "--" before the callback
		[[ -v callbacks["tag_comment"] ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
		buf=''
		continue
	    fi
	    
	    # check if the tag starts and ends at the same time (like "<br />")
	    if [[ "${buf}" == ~(Er).*/ ]] ; then
	        issingletag=true
		buf="${buf%*/}"
	    else
	        issingletag=false
	    fi
	    
	    # check if the tag has attributes (e.g. space after name)
	    if [[ "$buf" == ~(E)[[:space:]] ]] ; then
	        namebuf="${buf%%~(E)[[:space:]].*}"
                attrbuf="${buf#~(E).*[[:space:]]}"
            else
	        namebuf="$buf"
		attrbuf=''
	    fi
	    
            if (( isendtag )) ; then
                [[ -v callbacks["tag_end"] ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
            else
                [[ -v callbacks["tag_begin"] ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"

                # handle tags like <br/> (which are start- and end-tag in one piece)
                if (( issingletag )) ; then
                    [[ -v callbacks["tag_end"] ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
                fi
            fi
            buf=''
        else
            buf+="$c"
        fi
    done

    [[ -v callbacks["document_end"] ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"
    
    print # final newline to make filters like "sed" happy
    return 0
}

# return the value of LC_MESSAGES needed for subprocesses which
# want to run in a different locale/encoding
function get_lc_messages
{
	# Locale precedence per POSIX: LC_ALL overrides LC_MESSAGES,
	# which overrides LANG; fall back to the portable 'C' locale
	# when none of them is set (or they are all empty).
	typeset lcmsg="${LC_ALL:-${LC_MESSAGES:-${LANG:-C}}}"
	printf '%s\n' "${lcmsg}"
	return 0
}

# do_rssread <url>
# fetch the RSS feed at <url> and write it as plain text to stdout;
# the return value reflects the success of the URL fetch
function do_rssread
{
	integer res

	# set unicode locale since RSS is encoded in UTF-8
	# (and make sure $LC_MESSAGES is set to the parent
	# process's locale that all error messages are using
	# the callers locale/encoding)
	export \
		LC_MESSAGES="${ get_lc_messages ; }" \
		LC_MONETARY='en_US.UTF-8' \
		LC_NUMERIC='en_US.UTF-8' \
		LC_COLLATE='en_US.UTF-8' \
		LC_CTYPE='en_US.UTF-8' \
		LC_TIME='en_US.UTF-8' \
		LANG='en_US.UTF-8'

	# return non-zero exit code for this function if the rss processing below fails
	set -o errexit

	urlconnection_t hc
	hc.user_agent="rssread/ksh93(ssl) (2013-07-26; $(uname -s -r -p))"
	hc.init_url "$1"
	
	# need extra newline after cat_url to terminate line with $'\n'
	# to make "xml_tok" happy
	# NOTE(review): "data" is not declared with typeset and therefore
	# ends up in the global scope - presumably harmless here; verify
	data="${ hc.cat_url ; (( res=$? )) ; print ; }"
	
	print -u2 -f $"# Got %d lines of RSS data, processing...\n" "${ wc -l <<< "${data}" ; }"
	
	# tokenize the feed; handle_rss (registered in the global rsstok_cb
	# callback table) performs the RSS -> plain text conversion
	xml_tok 'rsstok_cb' <<< "${data}"

	return ${res}
}

# print the usage message generated from the self-documenting
# $rssread_usage option string and exit with code 2
function usage
{
	OPTIND=0
	getopts -a "${progname}" "${rssread_usage}" OPT '-?'
	exit 2
}

# main [ -I ] <url>
# parse the command line, resolve bookmark aliases and run the reader
function main
{
	bool noiconv=false	# true: skip the UTF-8 -> locale conversion
	
	while getopts -a "${progname}" "${rssread_usage}" OPT ; do 
		case "${OPT}" in
			'I')	noiconv=true	;;
			'+I')	noiconv=false	;;
			*)	usage		;;
		esac
	done
	shift $(( OPTIND-1 ))
	
	typeset url="$1"
	
	if [[ "${url}" == '' ]] ; then
		fatal_error $"No url given.\n"
	fi
	
	# allow short bookmark names instead of full URLs
	if [[ -v bookmark_urls[${url}] ]] ; then
		printmsg $"Using bookmark %q = %q\n" "${url}" "${bookmark_urls[${url}]}"
		url="${bookmark_urls[${url}]}"
	fi
	
	if (( noiconv )) ; then
		do_rssread "${url}"
	else
		# convert the UTF-8 output to the caller's locale/encoding
		# (NOTE(review): no iconv -t target is given - presumably the
		# platform iconv defaults to the current locale; verify)
		( set -o pipefail ; do_rssread "${url}" | iconv -f 'UTF-8' - )
	fi

	return $?
}

# make sure we use the ksh93 builtin versions
builtin basename
builtin cat
builtin mkfifo

set -o noglob	# no pathname expansion on unquoted expansions
set -o nounset	# expanding an unset variable is an error

# callback table for xml_tok when parsing the RSS input; "textbuf" is
# the text accumulator shared by the handle_rss invocations
typeset -A rsstok_cb # callbacks for xml_tok
rsstok_cb["tag_begin"]='handle_rss'
rsstok_cb["tag_end"]='handle_rss'
rsstok_cb["tag_text"]='handle_rss'
rsstok_cb["textbuf"]=''

# callback table for xml_tok when converting HTML to plain text;
# "html_pre" tracks whether we are inside a <pre> element
typeset -A xhtmltok_cb # callbacks for xml_tok
xhtmltok_cb["tag_begin"]='handle_html'
xhtmltok_cb["tag_end"]='handle_html'
xhtmltok_cb["tag_text"]='handle_html'
xhtmltok_cb["textbuf"]=''
xhtmltok_cb["html_pre"]='false'

# fields of the RSS item currently being parsed (filled by handle_rss)
typeset -A item

# "ramdom" urls for testing
typeset -r -A bookmark_urls=(
	['google_blogs_ksh']='https://www.google.com/search?hl=en&q=(%22ksh93%22%7C%22ksh%2093%22%20%7C%20%22korn93%22%20%7C%20%22korn%2093%22)&ie=utf-8&tbm=blg&tbs=sbd:1&num=100&output=rss'
	# some Sun staff/sites
	['blogs_sun_com']='http://blogs.oracle.com/main/feed/entries/rss'
	['bigadmin']='http://www.sun.com/bigadmin/content/rss/motd.xml'
	['bigadmin_scripts']='https://www.sun.com/bigadmin/content/rss/scripts.xml'
	['jmcp']='http://www.jmcp.homeunix.com/roller/jmcp/feed/entries/rss'
	['katakai']='http://blogs.oracle.com/katakai/feed/entries/rss'
	['alanc']='http://blogs.oracle.com/alanc/feed/entries/rss'
	['planetsun']='http://www.planetsun.org/rss20.xml'
	['planetsolaris']='http://www.planetsolaris.org/rss20.xml'
	['planetopensolaris']='http://planet.opensolaris.org/rss20.xml'
	['theregister_uk']='http://www.theregister.co.uk/headlines.rss'
	['heise']='http://www.heise.de/newsticker/heise.rdf'
	['slashdot']='http://rss.slashdot.org/Slashdot/slashdot'
	['wikipedia_command_shells']='http://en.wikipedia.org/w/index.php?title=Comparison_of_command_shells&feed=rss&action=history'
)

# program name used in error messages and for getopts
typeset progname="${ basename "${0}" ; }"

# self-documenting option string in ksh93 getopts(1) format
typeset -r rssread_usage=$'+
[-?\n@(#)\$Id: rssread (Roland Mainz) 2013-07-26 \$\n]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?rssread - fetch RSS messages and convert them to plain text]
[+DESCRIPTION?\brssread\b RSS to plain text converter
        which fetches RSS streams via HTTP and converts them from
	RSS to HTML to plain text in the current locale/encoding.]
[I:noiconv?Do not convert data from UTF-8 to current locale/encoding.]

[ url ]

[+SEE ALSO?\bksh93\b(1), \bshnote\b(1) \bshtinyurl\b(1)]
'

main "$@"
exit $?

#EOF.
