#!/usr/bin/perl

# Copyright © 2001 Colin Watson
# Copyright © 2008 Jordà Polo
# Copyright © 2017-2019 Chris Lamb <lamby@debian.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, you can find it on the World Wide
# Web at http://www.gnu.org/copyleft/gpl.html, or write to the Free
# Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
# MA 02110-1301, USA.

# You need copies of all the relevant manuals installed in the standard
# places locally (packages debian-policy, developers-reference, doc-base,
# python, lintian, menu, java-policy and vim-doc).

use v5.20;
use warnings;
use utf8;

use Const::Fast;
use Cwd qw(realpath);
use File::Basename qw(dirname);
use Path::Tiny;
use Unicode::UTF8 qw(decode_utf8 encode_utf8);

# neither Path::This nor lib::relative are in Debian
use constant THISFILE => realpath __FILE__;
use constant THISDIR => dirname realpath __FILE__;

# use Lintian modules that belong to this program
use lib THISDIR . '/../lib';

use List::SomeUtils qw(none);
use POSIX qw(strftime);

const my $EMPTY => q{};
const my $NEWLINE => qq{\n};
const my $ASTERISK => q{*};
const my $DOLLAR => q{$};
const my $DOUBLE_COLON => q{::};

# For each manual, we need:
#  * Location of the manual index on the local filesystem
#  * Base URL for the eventual target of the reference (or empty string if no
#    public URL is available)
#  * Regex to match the possible references
#  * Mapping from regex fields to reference fields (array of arrays of
#    keywords: url, section title; the position of each keyword in the array
#    defines which is its corresponding group in the regex)
#
# Optionally, if there are subsections that aren't available in the index, an
# additional regex can be defined to match possible references on other pages
# of the manual.

my $title_re = qr/<title\s?>(.+?)(?:\s+&#8212.+|\s+v[\d.]+)?<\/title\s?>/i;
my $link_re
  = qr/<link href="(.+?)" rel="[\w]+" title="([A-Z]|[A-Z]?[\d\.]+?)\.?\s+([\w\s[:punct:]]+?)">/;
my $index_re
  = qr/<a href="(.+?)">\s*([A-Z]|[A-Z]?[\d\.]+?)\.?\s+([\w\s[:punct:]]+?)\s*<\/a>/;
my $sphinx_re
  = qr/<a class="reference internal" href="(.+)">([\d\.]+)\.\s+(.*)<\/a>/;
my $page_fields = [['url'], ['section'], ['title']];
my $dbk_index_re
  = qr/([\d.]+?)\.\s+<a\s*href="(.+?)"\s*>([\w\s[:punct:]]+?)<\/a\s*>/i;
my $dbk_fields = [['section'], ['url'], ['title']];
my $dbk_index_re2
  = qr/<a\s*href="(.+?)"\s*>([\d.]+?)\.\s+([\w\s[:punct:]]+?)<\/a\s*>/i;

my %manuals = (
    'policy' => [
        '/usr/share/doc/debian-policy/policy.html/index.html',
        'https://www.debian.org/doc/debian-policy/',
        $sphinx_re, $page_fields
    ],
    'menu-policy' => [
        '/usr/share/doc/debian-policy/menu-policy.html/index.html',
        'https://www.debian.org/doc/packaging-manuals/menu-policy/',
        $index_re,
        $page_fields
    ],
    'perl-policy' => [
        '/usr/share/doc/debian-policy/perl-policy.html/index.html',
        'https://www.debian.org/doc/packaging-manuals/perl-policy/',
        $index_re,
        $page_fields
    ],
    'python-policy' => [
        '/usr/share/doc/python3/python-policy.html/index.html',
        'https://www.debian.org/doc/packaging-manuals/python-policy/',
        $index_re,
        $page_fields
    ],
    'java-policy' => [
        '/usr/share/doc/java-policy/debian-java-policy/index.html',
        'https://www.debian.org/doc/packaging-manuals/java-policy/',
        $dbk_index_re2,
        $page_fields
    ],
    'vim-policy' => [
        '/usr/share/doc/vim/vim-policy.html/index.html',
        'http://pkg-vim.alioth.debian.org/vim-policy.html/',
        $dbk_index_re,$dbk_fields
    ],
    'lintian' => [
        '/usr/share/doc/lintian/lintian.html',
        'https://lintian.debian.org/manual/',
        $dbk_index_re,
        $dbk_fields
    ],
    'devref' => [
        '/usr/share/doc/developers-reference/index.html',
        'https://www.debian.org/doc/developers-reference/',
        $index_re,
        $page_fields,
#<<< no perl tidy
# breaking this regex up with x should work, but for some reason it
# trips a false positive in the minimum-version (thinking it requires
# perl5.17 for some bizarre reason).
        qr{<h[45] class="title"><a id="(.+?)"></a>([\d\.]+?)\.? ([\w\s[:punct:]]+?)</h[45]>},
#>>>
    ],
    'menu' => [
        '/usr/share/doc/menu/html/index.html',
        'https://www.debian.org/doc/packaging-manuals/menu.html/',
        $index_re, $page_fields
    ],
    'doc-base' => [
        '/usr/share/doc/doc-base/doc-base.html/index.html',
        $EMPTY,$index_re, $page_fields
    ],
    'debconf-spec' => [
        '/usr/share/doc/debian-policy/debconf_specification.html',
        join(q{/},
            'https://www.debian.org',
            'doc/packaging-manuals/debconf_specification.html'),
        $index_re,
        $page_fields
    ],
    'fhs' => [
        '/usr/share/doc/debian-policy/fhs/fhs-3.0.html',
        'http://www.pathname.com/fhs/pub/fhs-3.0.html',
        qr/<a\s*href="(#.+?)"\s*>([\w\s[:punct:]]+?)<\/a\s*>/i,
        [['section', 'url'], ['title']]
    ],
);

# Check all of the manuals are present before trying anything
{
    my $ok = 1;
    foreach my $manual (sort keys %manuals) {
        my ($index, undef) = @{$manuals{$manual}};
        unless (-e $index) {
            print {*STDERR}
              encode_utf8(
                "Manual \"$manual\" not available (missing: $index)\n");
            $ok = 0;
        }
    }
    exit 1 unless $ok;
}

my $generated;
open(my $memory_fd, '>', \$generated)
  or die encode_utf8('Cannot open scalar');

for my $manual (sort keys %manuals) {
    my ($index, $url, $ref_re, $fields, $sub_re) = @{$manuals{$manual}};

    # Extract references from the index.
    my @subpages
      = extract_refs($memory_fd, $manual, 0, $index, $url, $ref_re, $fields);

    # Extract additional subsection references if not available in the index.
    next
      unless $sub_re;

    for my $pagename (@subpages) {
        my $page = dirname($index) . "/$pagename";
        extract_refs($memory_fd, $manual, 1, $page, $url, $sub_re, $fields);
    }
}

close $memory_fd;

my $date = strftime('%Y-%m-%d', gmtime);

my $header =<<"HEADER";
# Data about titles, sections, and URLs of manuals, used to expand references
# in tag descriptions and add links for HTML output.  Each line of this file
# has four fields separated by double colons:
#
#     <manual> :: <section> :: <title> :: <url>
#
# If <section> is empty, that line specifies the title and URL for the whole
# manual.  If <url> is empty, that manual is not available on the web.
#
# Last updated: $date

HEADER

my $output = encode_utf8($header) . $generated;
path('data/output/manual-references')->spew($output);

exit;

# extract_refs -- Extract manual references from HTML file.
#
# This function takes the output file handle, the path to the page, and the
# regex to match, and prints references to stdout. The second argument is used
# to decide whether to look for the title (0) or not (1). It returns a list of
# pages linked by the extracted references.
sub extract_refs {
    my ($data_fd, $manual, $title_done, $page, $url, $ref_re, $fields) = @_;

    open(my $page_fd, '<:utf8_strict', $page)
      or die encode_utf8("Cannot open $page");

    # Read until there are 2 newlines. This hack is needed since some lines in
    # the Developer's Reference are cut in the middle of <a>...</a>.
    local $/ = "\n\n";

    my %seen_sections;

    my @linked_pages = ();
    while (my $chunk = <$page_fd>) {

        if (!$title_done && $chunk =~ /$title_re/) {
            $title_done = 1;
            my $line = join($DOUBLE_COLON, $manual, $EMPTY, $1, $url);
            print {$data_fd} encode_utf8($line . $NEWLINE);
        }

        while ($chunk =~ /$ref_re/g) {

            my %ref;

            my $v = 1;
            for my $field (@{$fields}) {

                $ref{$_} = eval $DOLLAR . $v for @{$field};

            } continue {
                $v++;
            }

            if ($ref{url} =~ /^(.+?\.html)#?/i) {
                push(@linked_pages, $1)
                  if none { /$1/ } @linked_pages;
            }

            # If the extracted URL part doesn't look like a URL, assume it is
            # an anchor and convert to URL accordingly.
            $ref{url} = basename($page) . "#$ref{url}"
              if $ref{url} && $ref{url} !~ /(?:#|\.html$)/i;

            $ref{title} =~ s/\s+/ /g;
            $ref{title} =~ s{<span[^>]*>(.*?)</span ?>}{$1}ig;
            $ref{title} =~ s{<code[^>]*>(.*?)</code ?>}{<code>$1</code>}ig;

            $ref{url} = "$url$ref{url}";
            $ref{url} = $EMPTY if not $url;

            $ref{section} =~ s/^\#(.+)$/\L$1/;
            # Some manuals reuse section numbers for different references,
            # e.g. the Debian Policy's normal and appendix sections are
            # numbers that clash with each other. Track if we've already
            # seen a section pointing to some other URL than the current one,
            # and uniquify it by appending as many asterisks as necessary.
            $ref{section} .= $ASTERISK
              while (defined($seen_sections{$ref{section}})
                && $seen_sections{$ref{section}} ne $ref{url});
            $seen_sections{$ref{section}} = $ref{url};

            my $line= join($DOUBLE_COLON,
                $manual, $ref{section}, $ref{title}, $ref{url});
            print {$data_fd} encode_utf8($line . $NEWLINE);
        }
    }

    close($page_fd);

    return @linked_pages;
}

# Local Variables:
# indent-tabs-mode: nil
# cperl-indent-level: 4
# End:
# vim: syntax=perl sw=4 sts=4 sr et
