#!/usr/bin/perl

############################################################################
# author: Sebastian Lange, mailto:info@sl-chat.de
# date: 2000-05-16
# version: v1.0
#
# adds: mainly inspired by The Perl Cookbook's Example 16-7
# adds: reference http://www.w3.org/People/Raggett/tidy/
#
# what:
#	-cleaning HTML input using Dave Raggett's HTML Tidy
#	- including optional possibility to remove non-body document tags
#	- additional option to have JavaScript and CSS containers returned 
#		together with body
#
# to-do:
#	- better documentation
#	- some code cleanup
#	- meaningful error messages, improved error detection
#	- actual implementation on a webserver
#	- some more fixes for tidy flaws (unless tidy gets to take care of 
#		them by itself one day)
#
# methods:
#	tidyString(string STRING[, int NODOCTAGS])
#		what:
#			- pass a string to HTML Tidy, parse and return the output
#
#		parameter:	
#			- STRING: any string containing HTML tags
#			- optional NODOCTAG: switch either to leave tidy-result 
#				untouched (0) or to cut non-body parts (1) or to return
#				JavaScript and CSS containtes with the body (2)
#				default: 1
#		result:
#			- die(error message) in case of error
#			- STRING if there's no valid HTML tag in STRING
#			- STRING: tidy-parsed HTML-file
#
#		example:
#			-
#
#		note:
#			- the result will be the original STRING, if there are no
#				HTML-Tags in the file
#
############################################################################

# globals:
	$g_tidyPath		= "";			# tidy location
	$g_tidyVersion	= "30apr00";	# tidy version
	$g_tidyOptions	= "";			# tidy options (refer to tidy manual)
		
		## if you tell tidy to use the following config file,
		#	you don't need to specify further options
		#
		$g_tidyOptions .= " -config $g_tidyPath/tidy.conf";
		#
		##
		$g_tidyOptions .= " -quiet";
		$g_tidyOptions .= " -f /dev/null";
		
		## following are a couple sample options, for more info please
		##	refer to: http://www.w3.org/People/Raggett/tidy/
		
		#$g_tidyOptions .= " -latin1";
		#$g_tidyOptions .= " --wrap 0";
		#$g_tidyOptions .= " --wrap-attributes no";
		#$g_tidyOptions .= " --indent auto";
		#$g_tidyOptions .= " --tidy-mark no";
		#$g_tidyOptions .= " --uppercase-tags yes";
		#$g_tidyOptions .= " --uppercase-attributes yes";
		#$g_tidyOptions .= " --alt-text '[]'"; # handle this with care
		#$g_tidyOptions .= " --doctype '-//W3C//DTD HTML 4.0 Transitional//EN'";
		#$g_tidyOptions .= " --doctype '-//W3C//DTD HTML 3.2//EN'";
		#$g_tidyOptions .= " --quote-marks yes";
		#$g_tidyOptions .= " --enclose-text yes";
		#$g_tidyOptions .= " --";
		#$g_tidyOptions .= " --";


### BEGIN sub tidyString ###
sub tidyString {
	# usage: tidyString(string STRING[, int NODOCTAGS])
	# result: STRING with tidy-parsed HTML
	die("usage: tidyString(string STRING[, int NODOCTAGS])") unless @_;
	return ($_[0]) if ($_[0] !~ /<[^>]*>/);	# return STRING if it has no HTML tags, nothing to parse
	if (!defined $_[1]) { $_[1] = 1; }		# NODOCTAGS shall default to 1, if omitted
											# (set it to 0 if you do want to receive 
											# a complete HTML document; to 2 if SCRIPT and STYLE
											# tags in the header shall be returned with the body)

	pipe (READ_FROM_PARENT, WRITE_TO_CHILD) or die "can't pipe(1): $!";
	pipe (READ_FROM_CHILD, WRITE_TO_PARENT) or die "can't pipe(2): $!";
	$parent = fork();
	die ("fork failed: $!") unless defined $parent;
	
	unless ($parent) {
		# we are the child

		my $tidy = $g_tidyPath . "/tidy" . $g_tidyVersion;

		# close obsolete file handles
		close(READ_FROM_CHILD);
		close(WRITE_TO_CHILD);

		# redirect STDIN
		close(STDIN);
		open(STDIN, "<&READ_FROM_PARENT");

		# redirect STDOUT
		close(STDOUT);
		open(STDOUT, ">&WRITE_TO_PARENT");

		select(STDIN);
		$| = 1;
		
		# switch process to tidy
		exec("$tidy$g_tidyOptions") or die("exec of \'$tidy\' failed: $!");

		# END of child
	} else {
		# we are the parent

		my $tidied_data = "";

		# close obsolete file handles
		close(READ_FROM_PARENT);
		close(WRITE_TO_PARENT);
		
		# print the to-be-tidied string to the child's STDIN
		print WRITE_TO_CHILD $_[0];
		# close the file handle
		close(WRITE_TO_CHILD);

		# read everything from the child's STDOUT into $tidied_data
		while (<READ_FROM_CHILD>) {
			$tidied_data .= $_;
		}

		# close the file handle
		close(READ_FROM_CHILD);

		# reap the child
		waitpid($parent, 0);

		if ($_[1] > 0) {
			# remove unwanted document tags
			# (we want only what is between <BODY> and </BODY>)

			if ($tidied_data !~ /<BODY[^>]*>/i) {
				# cancel, if no body
				return ("[could not find BODY in message]");
			}

			if ($_[1] > 1) {
				# we'd like to have JavaScript and CSS codes returned
				my $JavaScript = "";
				my $CSS = "";

				$tidied_data =~ s\(<SCRIPT[^>]*>.*?</SCRIPT[^>]*>)\$1\si;
				$JavaScript = $1 . "\n" unless ($1 eq "");

				$tidied_data =~ s\(<STYLE[^>]*>.*?</STYLE[^>]*>)\$1\si;
				$CSS = $1 . "\n" unless ($1 eq "");

				# add the JavaScript and StyleSheet tags to the output
				# at the beginning of BODY
				$tidied_data = $JavaScript . $CSS . $tidied_data;
				$tidied_data =~ s/(<BODY[^>]*>\n?)/$JavaScript$CSS$1/si;

			}
		
			# strip everything before <BODY> and after </BODY>
			$tidied_data =~ s/^.*<BODY[^>]*>\n?//si;
			$tidied_data =~ s\</BODY[^>]*>.*</HTML[^>]*>.*$\\si;
		}

		# drop empty FONT tags (tidy 30apr00 only drops empty P's)
		$tidied_data =~ s/<FONT[^>]*><\/FONT[^>]*>\n?//gsi;
		# drop known false attributes
		while ($tidied_data =~ /\s*(BORDERCOLOR([a-z]*))\s*=\s*".*?"/i) {
			$tidied_data =~ s/\s*$1\s*=\s*".*?"//gsi;
		}
	
		# drop unknown empty attributes, ALT shall be allowed to be empty
		while ($tidied_data =~ /\s*([a-z]+)\s*=\s*""/i) {
			$myEmptyAttrib = $1;
			if ($myEmptyAttrib =~ /ALT/i) {
				# replace the recognized empty attribute with a placeholder
				$myEmptyAttribStr = "myEmpty" . $myEmptyAttrib . "Attrib";
				$tidied_data =~ s/(<[^>]*?)\s(?:[a-z]+)\s*=\s*""([^>]*>)/$1 $myEmptyAttribStr$2/gsi;
			} else {
				# drop the unrecognized empty attribute
				$tidied_data =~ s/(<[^>]*?)\s(?:[a-z]+)\s*=\s*""([^>]*>)/$1$2/gsi;
			}
		}
		# replace the placeholder with the appropriate empty attribute
		$tidied_data =~ s/myEmpty([a-z]+)Attrib/$1=""/gsi;

		return $tidied_data;

		# END of parent
	}
}
### END sub tidyString ###
