#!/usr/bin/env perl 
#
# roffit: convert man page source files to HTML
#
# Read an nroff file. Output a HTML file.
#
# This is a very simple script, but I use it on very simple man pages as I've
# found no decent script that makes really nice web pages.
#
# Author:  Daniel Stenberg <daniel@haxx.se>
# Web:     http://daniel.haxx.se/projects/roffit

my $version = "0.8";

use strict;
#use warnings;

my $InFH = \*STDIN;
my $OutFH = \*STDOUT;
my $debugFH = \*STDERR;

my %manpage;
my @out;

my $indentlevel=0; # logical levels, not columns
my @p;
my $within_tp;
my $standalone=1; # by default we make stand-alone HTML pages
my $pre;
my %anchor; # hash with all anchors

my $mandir;
my $hrefdir=".";

while($ARGV[0]) {
    if($ARGV[0] eq "--bare") {
        # don't include headers and stuff
        $standalone=0;
        shift @ARGV;
    }
    elsif($ARGV[0] eq "--version") {
        print "roffit $version (http://daniel.haxx.se/projects/roffit/)\n";
        exit;
    }
    elsif($ARGV[0] =~ /^--mandir=(.*)/) {
        # check for other HTMLized man pages in this given dir
        $mandir=$1;
        shift @ARGV;
    }
    elsif($ARGV[0] =~ /^--hrefdir=(.*)/) {
        # if another manpage is found in the --mandir, this is the dir we
        # prefix the HTML file name with in the output HTML
        $hrefdir=$1;
        shift @ARGV;
    }
    else {
        printf $debugFH "unknown option: %s\n", $ARGV[0] if($ARGV[0] ne "-h");
        print $debugFH "Usage: roffit [options] < infile > outfile\n",
        "Options:\n",
        " --version       display roffit version and exit\n",
        " --bare          do not put in HTML, HEAD, BODY tags\n",
        " --mandir=<dir>  check for other HTMLized man pages in this dir\n",
        " --hrefdir=<dir> if a manpage is found in the --mandir, this is the\n",
        "                 dir we prefix the HTML file name with in the output\n";
        exit;
    }
}

sub showp {
    my @p = @_;
    push @out, "\n" if(!$pre);
    push @out, "<p class=\"level$indentlevel\">", @p;
}

sub defaultcss {
    print $OutFH <<ENDOFCSS
<STYLE type="text/css">
P.level0 {
 padding-left: 2em;
}

P.level1 {
 padding-left: 4em;
}

P.level2 {
 padding-left: 6em;
}

span.emphasis {
 font-style: italic;
}

span.bold {
 font-weight: bold;
}

span.manpage {
 font-weight: bold;
}

h2.nroffsh {
 background-color: #e0e0e0;
}

span.nroffip {
 font-weight: bold;
 font-size: 120%;
 font-family: monospace;
}

p.roffit {
 text-align: center;
 font-size: 80%;
}
</STYLE>
ENDOFCSS
    ;
}

sub text2name {
    my ($text) = @_;
    $text =~ s/^ *([^ ]*).*/$1/g;
    $text =~ s/[^a-zA-Z0-9-]//g;
    return $text;
}

# scan through the file and check for <span> sections we should convert
# to proper links
sub linkfile {
    my @new;
    for(@out) {
        my $line=$_;
        my $l;
        while($line =~ s/<span class=\"(manpage|emphasis|bold)\">([^<]*)<\/span>/[]/) {
            my ($style, $name)=($1, $2);
            
            $l = text2name($name);

            my $link;
            if($anchor{$l}) {
                $link="<a class=\"$style\" href=\"#$l\">$name</a>";
            }
            else {
                if(($name =~ /^ *(\w*) *\( *(\d*) *\) *$/) && ($mandir)) {

                    # this looks like a reference to another man page, and we
                    # have asked for this feature! We check for the specified
                    # nroff file and not the HTML version, to avoid depending
                    # on which order the set of files are converted to HTML!
                    my ($symbol, $section)=($1, $2);
                    my $file = "$mandir/$symbol.$section";

                    if(-r $file) {
                        my $html = "$hrefdir/$symbol.html";
                        # there is such a HTML file present, produce a link
                        # to it!
                        $link="<a class=\"$style\" href=\"$html\">$name</a>";
                    }
                }
                if(!$link) {
                    $link="<span Class=\"$style\">$name</span>";
                }
            }
            $line =~ s/\[\]/$link/;
        }
        # convert lowercase ftp:// or http:// links to <a href> links
        $line =~ s/((http|ftp):\/\/([a-z0-9.\/_%-]*[a-z\/]))/<a href=\"$2:\/\/$3\">$1<\/a>/g;

        # convert (uppercase only) "RFC [number]" to a link
        $line =~ s/RFC *(\d+)/<a href=\"http:\/\/www.ietf.org\/rfc\/rfc$1.txt\">RFC $1<\/a>/;

        push @new, $line;
    }
    return @new;
}

sub handle_italic_bold ($) {
    $_ = shift;
    $_ =~ s/\\fI/<span class=\"emphasis\">/g;
    $_ =~ s/\\fB/<span class=\"bold\">/g;
    $_ =~ s/\\f[PR]/<\/span>/g;
    return $_;
}

sub parsefile {
    my $lineno;
    while(<$InFH>) {
        $lineno++;
        my $in = $_;
        my $out;
  #     print $debugFH "DEBUG IN: $_";

        $in =~ s/[\r\n]//g if(!$pre); # tear off newlines

        $in =~ s/\r//g; # remote carriage return always

        # substitue all '\-' sequences with '-'.
        # '\-' might be in the source file to specify a literal
        # '-' rather than a hyphen.
        $in =~ s/\\-/-/g;

        if($in =~ /^\'\\\"/) {
            # ignore this kind of weird comment
            $out ="";
        }
        elsif($in =~ /^\.([^ \t\n]*)(.*)/) {
            # this is a line starting with a dot, that means it is special
            my ($keyword, $rest) = ($1, $2);
            $out = "";
            
            # cut off initial spaces
            $rest =~ s/^\s+//;
            
            if ( $keyword eq q(\\") ) {
                # this is a comment, skip this line
            }
            elsif ( $keyword eq "TH" ) {
                # man page header:
                # curl 1 "22 Oct 2003" "Curl 7.10.8" "Curl Manual"

		# Treat pages that have "*(Dt":
		# .TH IDENT 1 \*(Dt GNU

		$rest =~ s,\Q\\*(Dt,,g;

		# Delete backslashes

		$rest =~ s,\\,,g;

		# Delete old RCS tags
		# .TH saidar 1 $Date:\ 2006/11/30\ 23:42:42\ $ i\-scream

		$rest =~ s,\$Date:\s+(.*?)\s+\$,$1,g;

                # NAME SECTION DATE VERSION MANUAL
		# section can be: 1 or 3C

                if ( $rest =~ /(\S+)\s+\"?(\d+\S?)\"?\s+\"([^\"]*)\" \"([^\"]*)\"(\"([^\"]*)\")?/ ) {
                    # strict matching only so far
                    $manpage{'name'}    = $1;
                    $manpage{'section'} = $2;
                    $manpage{'date'}    = $3;
                    $manpage{'version'} = $4;
                    $manpage{'manual'}  = $6;
                }
	        # .TH html2text 1 2008-09-20 HH:MM:SS
		elsif ( $rest =~  m, (\S+) \s+ \"?(\d+\S?)\"? \s+ \"?([ \d:/-]+)\"? \s* (.*) ,x )
		{
                    $manpage{'name'}    = $1;
                    $manpage{'section'} = $2;
                    $manpage{'date'}    = $3;
                    $manpage{'manual'}  = $4;
		}
		# .TH program 1 description
		elsif ( $rest =~ /(\S+) \s+ \"?(\d+\S?)\"? \s+ (.+)/x )
		{
                    $manpage{'name'}    = $1;
                    $manpage{'section'} = $2;
                    $manpage{'manual'}  = $3;
		}
		# .TH program 1
		elsif ( $rest =~ /(\S+) \s+ \"?(\d+\S?)\"? /x )
		{
                    $manpage{'name'}    = $1;
                    $manpage{'section'} = $2;
		}
            }
            elsif($keyword =~ /^S[HS]$/) {
                # SS is treated the same as SH
                # Section Header
                showp(@p);
                @p="";
                if($pre) {
                    push @out, "</pre>\n";
                    $pre = 0;
                }

                my $name = text2name($rest);
                $anchor{$name}=1;

                $rest =~ s/\"//g; # cut off quotes
                $rest =~ s/</&lt;/g;
                $rest =~ s/>/&gt;/g;
                $out = "<a name=\"$name\"></a><h2 class=\"nroffsh\">$rest</h2>";
                $indentlevel=0;
                $within_tp=0;
            }
            elsif(($keyword =~ /^B$/) || ($keyword =~ /^BI$/)) {
                # Make B and BI the same for simplicity
                $rest =~ s/\"//g; # cut off quotes
                $rest =~ s/</&lt;/g;
                $rest =~ s/>/&gt;/g;

                # This is pretty lame, but since a .B section and itself
                # contain a \fI[blabla]\fP section, we cut off all the \f
                # occurances within the .B text to make it easier for ourself.
                $rest =~ s/\\f[IPR]//g;

                push @p, "<span class=\"bold\">$rest</span> ";
                if($pre) {
                    push @p, "\n";
                }
            }
            elsif(($keyword =~ /^I$/) || ($keyword =~ /^IR$/)) {
                $rest =~ s/\"//g; # cut off quotes
                $rest =~ s/</&lt;/g;
                $rest =~ s/>/&gt;/g;
                push @p, "<span class=\"emphasis\">$rest</span> ";
            }
            elsif($keyword =~ /^RS$/) {
                # the start of another indent-level. for inlined tables
                # within an "IP"
                showp(@p);
                @p="";
                $indentlevel++;
            }
            elsif($keyword =~ /^RE$/) {
                # end of the RS section
                showp(@p);
                @p="";
                $indentlevel--;
            }
            elsif($keyword =~ /^NF$/) {
                # We let nf start a <pre> section .nf = no-fill, you use for
                # graphs or text that you don't want spaces to be ignored

                showp(@p);
                @p="";
                push @out, "<pre>\n";
                $pre=1
            }
            elsif($keyword =~ /^TP$/) {
                # Used within an "RS" section to make a new line. The first
                # TP as a column indicator, but we decide to do that
                # controlling in the CSS instead.
                $within_tp=1;
                showp(@p);
                @p="";                
            }
            elsif($keyword =~ /^IP$/) {
                # start of a new paragraph coming up
                showp(@p);
                @p="";

                my $name= text2name($rest);
                $anchor{$name}=1;

                $rest =~ s/\"//g; # cut off quotes
                $rest =~ s/</&lt;/g;
                $rest =~ s/>/&gt;/g;
                
                $indentlevel-- if ($indentlevel);
                push @p, "<a name=\"$name\"></a><span class=\"nroffip\">$rest</span> ";
                # make this a single-line title
                showp(@p);
                @p="";
                $indentlevel++;
                $within_tp=0;
            }
            elsif($keyword =~ /^ad$/) {
                showp(@p);
                @p="";
            }
            elsif($keyword =~ /^sp$/) {
                showp(@p);
                @p="";
            }
            elsif($keyword =~ /^lp$/) {
                # marks end of a paragraph
                showp(@p);
                @p="";
            }
            elsif($keyword =~ /^pp$/) {
                # PP ends a TP section, but some TP sections don't use it
                # Often used to separate paragraphs
                $within_tp=0;
                showp(@p);
                @p="";
            }
            elsif($keyword =~ /^fi$/) {
                # .fi = fill-in, extra space will be ignored and text that is
                # entered like this, the fill-in command will continue until
                # you enter a .nf command and vice-versa

                showp(@p);
                @p="";
                if($pre) {
                    # if this is the end of a .nf (<pre>) section
                    push @out, "</pre>\n";
                    $pre=0; # disable pre again
                }

            }
            elsif($keyword =~ /^(de|ft|\.)$/) {
                # no idea what these do!
            }
            elsif($keyword =~ /^so$/) {
                # This keyword refers to a different man page, named in the
                # $rest.
                # We don't support this
                push @out, "See the $rest man page.\n";
            }
            elsif($keyword =~ /^BR$/) {
                # I'm not sure what this does exactly, but this is commonly
                # used to include pointers to other man pages. Let's assume
                # it only does that for now.
                # blabla (3)
                # or "blabla (3)"
                # or strcmp "(3), " strcasecmp "(3)"
                # etc
                
                $rest =~ s/\"//g; # cut off quotes
                $rest =~ s/</&lt;/g;
                $rest =~ s/>/&gt;/g;

                if(!$rest) {
                    # A stand-alone .br will become a line break
                    #print $debugFH "ALERT: bare BR\n";
                    push @p, "<br>";
                }
                else {
                    my @all = split /,/, $rest;
                    for(@all) {
                        if(/([^ ]*) *\((\d+)\)/) {
                            # TODO: this looks like a man page, check if
                            # there's a HTML file for it and if so make a link
                            # to it
                        }
                        
                        push @p, "<span class=\"manpage\">$_</span> ";
                    }
                }
            }
            else {
                showp(@p);
                @p="";
                print $debugFH "ALERT: unknown keyword \"$keyword\" on line $lineno\n";
            }
        }
        else {
            # text line, decode \-stuff
            my $txt = $in;

            $txt =~ s/</&lt\;/g;
            $txt =~ s/>/&gt\;/g;

	    $txt = handle_italic_bold $txt;

            $txt =~ s/\\&//g;
            $txt =~ s/\\-/-/g;
            $txt =~ s/\\ /&nbsp\;/g;
            $txt =~ s/\\\'/&acute\;/g;
            $txt =~ s/\\\(co/&copy\;/g;

            # replace backslash [something] with just [something]
            $txt =~ s/\\(.)/$1/g;

            if($txt =~ /^[ \t\r\n]*$/) {
                # no contents, marks end of a paragraph
                showp(@p);
                @p="";
            }
            else {
                $txt =~ s/^ /\&nbsp\;/g;
                push @p, "$txt ";
            }
            $out ="";
        }

        if($out) {
            push @out, $out;
   #         print $debugFH "DEBUG OUT: $out\n";
        }
        else {
   #         print $debugFH "DEBUG OUT: [withheld]\n";
        }
    }
    showp(@p);
}

parsefile();

my @conv = linkfile();

my $title=sprintf("%s man page",
                  $manpage{'name'}?$manpage{'name'}:"secret");

if($standalone) {
    print $OutFH <<MOO
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
 "http://www.w3.org/TR/html4/loose.dtd">
<html><head>
<title>$title</title>
<meta name="generator" content="roffit">
MOO
    ;
    defaultcss();
    print "</head><body>\n";
}
else {
    print "<!-- generated with roffit -->\n";
}

print $OutFH @conv;
print $OutFH <<ROFFIT
<p class="roffit">
 This HTML page was made with <a href="http://daniel.haxx.se/projects/roffit/">roffit</a>.
ROFFIT
    ;

if($standalone) {
    print "</body></html>\n";
}
