eb/doc/html-split
2016-10-28 20:02:30 -07:00

239 lines
5.9 KiB
Perl
Executable File
Raw Blame History

#! /usr/bin/perl
#
# Copyright (c) 2005-2006 Motoyuki Kasahara
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# 3. Neither the name of the project nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
#
# html-split -- split an HTML file.
#
# Usage:
# html-split [option...] input-file
#
# `html-split' splits an HTML file with heading tags (<h1>...<h6>).
# Suppose that `input-file' is `foo.html', HTML files splitted by
# `html-split' are `foo-0.html', `foo-1.html', and so on.
#
# Options:
# -Z do not add `-0' to the first split.
# -l LEVEL split with this heading level
# (default: h2)
# -p PREFIX prefix of splitted HTML files.
# -s SUFFIX suffix of splitted HTML files.
# (default: html)
# -w WIDTH minimum width of split number.
# (default: 1)
# -t TOC fragment name of `Table of Contents'.
# (default: toc)
#
require 5.005;
use Getopt::Std;
use File::Basename;
#
# Usage
#
my $usage = "Usage: $0 [option...] input-file\n";
#
# Variables
#
my $in_file;
my $out_prefix;
my $out_suffix;
my $counter_width = 1;
my $split_level = 2;
my $toc_tag = 'toc';
my $supress_zero_flag = 0;
my @toc = ();
my @indice = ();
my @preamble = ();
my $toc_page = 0;
#
# Parse command line arguments.
#
my %options;
getopts('Zl:w:p:s:t:', \%options) or die $usage;
die $usage if (@ARGV != 1);
$in_file = $ARGV[0];
if (defined($options{l})) {
$options{l} =~ s/^h//;
$split_level = $options{l};
}
$counter_width = $options{w} if (defined($options{w}));
$supress_zero_flag = defined($options{Z});
$toc_tag = $options{t} if (defined($options{t}));
if (defined($options{p})) {
$out_prefix = $options{p};
} else {
$out_prefix = basename($in_file, '.htm', '.html');
}
if (defined($options{s})) {
$out_suffix = $options{s};
} elsif ($in_file =~ m|\.htm$|) {
$out_suffix = 'htm';
} else {
$out_suffix = 'html';
}
#
# Read an HTML file.
#
if (!open(IN_FILE, "< $in_file")) {
die "$0: failed to open the file, $!: $in_file\n";
}
my $toc_found = 0;
my $page = 0;
while (<IN_FILE>) {
last if (m|^<body>|);
push(@preamble, $_);
}
while (<IN_FILE>) {
chomp;
last if (m|^</body>|);
if (m|^<h([1-6])>| && $1 <= $split_level) {
$page++ if (@toc > 0);
push(@toc, $_);
}
if (m|<a name=\"([^\"]+)\">|) {
my $tag = $1;
if ($tag eq $toc_tag) {
$toc_page = $page;
$toc_found = 1;
}
push(@indice, {'tag' => $tag, 'page' => $page});
}
}
close(IN_FILE);
if (!$toc_found) {
die "$0: <a name=\"$toc_tag\"> not found\n";
}
#
# Generate splitted HTML files.
#
if (!open(IN_FILE, "< $in_file")) {
die "$0: failed to open the file, $!: $in_file\n";
}
while (<IN_FILE>) {
last if (m|^<body>|);
}
for (my $page = 0; $page < @toc; $page++) {
my $bar = '';
if ($page > 0) {
$bar .= sprintf("[<a href=\"%s\"><3E><><EFBFBD><EFBFBD></a>] ",
splitted_file_name($page - 1));
}
if ($page + 1 < @toc) {
$bar .= sprintf("[<a href=\"%s\"><3E><><EFBFBD><EFBFBD></a>] ",
splitted_file_name($page + 1));
}
$bar .= sprintf("[<a href=\"%s\#%s\"><3E>ܼ<EFBFBD></a>] ",
splitted_file_name($toc_page), $toc_tag);
my $out_file = splitted_file_name($page);
if (!open(OUT_FILE, "> $out_file")) {
die "$0: failed to open the file, $!: $out_file\n";
}
foreach my $j (@preamble) {
print OUT_FILE $j;
}
print OUT_FILE "<body>\n";
print OUT_FILE "<p>\n", $bar, "\n</p>\n<hr>\n";
print OUT_FILE $toc[$page], "\n";
for (;;) {
$_ = <IN_FILE>;
chomp;
if (!defined($_) || m|^</body>|) {
1 while (<IN_FILE>);
last;
}
elsif (m|^<h([1-6])>| && $1 <= $split_level) {
next if ($page == 0 && $_ eq $toc[$page]);
last;
}
1 while (s|<a href="\#([^\"]+)">|&rewrite_href($1)|e);
print OUT_FILE $_, "\n";
}
print OUT_FILE "<hr>\n<p>\n", $bar, "\n</p>\n";
print OUT_FILE "</body>\n";
print OUT_FILE "</html>\n";
close(OUT_FILE);
}
close(IN_FILE);
#
# Return n'th splitted file name.
#
sub splitted_file_name ($) {
my ($n) = @_;
if ($n == 0 && $supress_zero_flag) {
return sprintf("%s.%s", $out_prefix, $out_suffix);
} else {
return sprintf("%s-%0${counter_width}d.%s",
$out_prefix, $n, $out_suffix);
}
}
#
# Rewrite <a href="...">.
#
sub rewrite_href ($) {
my ($tag) = @_;
for (my $i = 0; $i < @indice; $i++) {
if ($indice[$i]->{tag} eq $tag) {
return sprintf("<a href=\"%s\#%s\">",
splitted_file_name($indice[$i]->{page}), $tag);
}
}
warn "$0: unknown tag \`$tag'\n";
return "<a href=\"$tag\">";
}