-
Notifications
You must be signed in to change notification settings - Fork 102
/
Copy pathlatexml
executable file
·366 lines (298 loc) · 13.7 KB
/
latexml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
#!/usr/bin/env perl
# /=====================================================================\ #
# | latexml | #
# | main conversion program | #
# |=====================================================================| #
# | Part of LaTeXML: | #
# | Public domain software, produced as part of work done by the | #
# | United States Government & not subject to copyright in the US. | #
# |---------------------------------------------------------------------| #
# | Bruce Miller <bruce.miller@nist.gov> #_# | #
# | http://dlmf.nist.gov/LaTeXML/ (o o) | #
# \=========================================================ooo==U==ooo=/ #
use strict;
use warnings;
use FindBin;
use lib "$FindBin::RealBin/../lib";
use Getopt::Long qw(:config no_ignore_case);
use Pod::Usage;
use LaTeXML::Core;
use LaTeXML; # Currently, just for version information.
use LaTeXML::Util::Pathname;
use LaTeXML::Common::Error;
use Encode;
#**********************************************************************
# Parse command line
# Option state, initialised to the values used when a switch is absent.
my $verbosity     = 0;        # each --verbose adds one, each --quiet subtracts one
my $strict        = 0;
my $comments      = 1;        # cleared by --nocomments
my $noparse       = 0;
my $includestyles = 0;
my $format        = 'xml';    # 'xml', 'tex' or 'box'
my $destination   = '';
my ($help, $showversion);
my ($preamble, $postamble, $logfile);
my $documentid;
my $inputencoding;
my $mode;                     # set to 'BibTeX' by --bibtex; otherwise left undefined here
my @paths;
my @preload;

# Table mapping each command-line switch to the variable (or action) it controls.
my @option_spec = (
    'destination=s'   => \$destination,
    'output=s'        => \$destination,    # obsolete synonym for --destination
    'preload=s'       => \@preload,
    'path=s'          => \@paths,
    'preamble=s'      => \$preamble,
    'postamble=s'     => \$postamble,
    'quiet'           => sub { $verbosity--; },
    'verbose'         => sub { $verbosity++; },
    'strict'          => \$strict,
    'log=s'           => \$logfile,
    'xml'             => sub { $format = 'xml'; },
    'tex'             => sub { $format = 'tex'; },
    'box'             => sub { $format = 'box'; },
    'bibtex'          => sub { $mode   = 'BibTeX'; },
    'noparse'         => \$noparse,
    'includestyles'   => \$includestyles,
    'inputencoding=s' => \$inputencoding,
    'comments!'       => \$comments,
    'VERSION'         => \$showversion,
    # Debug flags are keyed by the lowercased value given to --debug.
    'debug=s' => sub {
        my (undef, $package) = @_;
        $LaTeXML::DEBUG{ lc($package) } = 1;
    },
    'documentid=s' => \$documentid,
    'help'         => \$help,
);

# A malformed command line gets the brief usage on STDERR and exit status 1.
GetOptions(@option_spec)
  or pod2usage(-message => $LaTeXML::IDENTITY,
    -exitval => 1, -verbose => 0, -output => \*STDERR);

# --help prints the full manual on STDOUT and exits successfully.
if ($help) {
    pod2usage(-message => $LaTeXML::IDENTITY, -exitval => 0, -verbose => 2, -output => \*STDOUT);
}

# --VERSION prints the identity string (on STDERR) and exits successfully.
if ($showversion) {
    print STDERR "$LaTeXML::IDENTITY\n";
    exit(0);
}

# The first remaining argument is the source; validate_args complains about any others.
my $source = shift(@ARGV);
#======================================================================
# Validate command line before any actual processing
# Configure message reporting, then check the arguments.  When anything is
# wrong, print the identity line, every problem found (colorized as an error)
# and a usage hint to STDERR, and exit with status 1.
SetVerbosity($verbosity);
UseSTDERR();
if (my @problems = validate_args()) {
    my @report = ($LaTeXML::IDENTITY);
    push(@report, map { colorizeString($_, 'error') } @problems);
    push(@report, "use option --help for details", "Conversion failed", '');
    print STDERR join("\n", @report);    # the trailing '' supplies the final newline
    exit(1);
}
#======================================================================
my $starttime = StartTime();
my ($latexml, $digested, $serialized);
eval {    # Catch errors, so that the status can still be reported afterwards
    UseLog($logfile);
    Note("$LaTeXML::IDENTITY processing $source");

    # Hand the command-line settings over to the conversion engine.
    my @core_options = (
        preload         => [@preload],
        searchpaths     => [@paths],
        graphicspaths   => ['.'],
        verbosity       => $verbosity,
        strict          => $strict,
        includecomments => $comments,
        inputencoding   => $inputencoding,
        includestyles   => $includestyles,
        documentid      => $documentid,
        nomathparse     => $noparse,
    );
    $latexml = LaTeXML::Core->new(@core_options);

    # All processing is done relative to that State, to capture all errors.
    my $convert = sub {
        # First read and digest whatever we're given.
        $digested = $latexml->digestFile($source,
            mode      => $mode,
            preamble  => $preamble,
            postamble => $postamble);

        # Then serialize in the requested format (XML via the DOM unless
        # --tex or --box was given).
        if ($digested) {
            if ($format eq 'box') {
                $serialized = ($verbosity > 0) ? $digested->stringify : $digested->toString;
            }
            elsif ($format eq 'tex') {
                $serialized = LaTeXML::Core::Token::UnTeX($digested);
            }
            else {
                $serialized = $latexml->convertDocument($digested)->toString(1);
            }
            # NOTE: the serialization (via LaTeXML's Document::serialize_aux) has
            # NOT been encoded into bytes, so encode explicitly before output.
            if ($serialized) {
                $serialized = Encode::encode('UTF-8', $serialized);
            }
        }
        $latexml->showProfile();    # Show profile (if any)
    };
    $latexml->withState($convert);
};
#======================================================================
# Save the outcome of the conversion eval immediately: $@ is a global that
# any later eval would overwrite.
my $exit_message = $@;
Debug($exit_message) if $exit_message ne $LaTeXML::Common::Error::DIE_MESSAGE;

# $latexml is still undefined when the failure happened before (or while)
# the LaTeXML::Core object was created; then there is no status to query.
my $code    = $latexml && $latexml->getStatusCode;
my $status  = $latexml && $latexml->getStatusMessage;
my $runtime = RunTime($starttime);

# Report failure for status code 3, and also whenever an exception escaped
# or no converter exists -- cases that were formerly logged as "complete"
# (with uninitialized-value warnings) even though the exit status is 1.
my $failed = ($exit_message || !$latexml || (defined($code) && ($code == 3))) ? 1 : 0;
$status = 'no status available' unless defined $status;
Note("Conversion " . ($failed ? 'failed' : 'complete') . ": " . $status . " (reqd. $runtime)");
UseLog(undef);

# Should this be within the withState ?
# Hopefully, no errors by now... but if so...
# Does having the status line AFTER the result help or hurt????
if ($serialized) {
    if ($destination) {
        open(my $OUTPUT, ">", $destination)
          or die "Couldn't open output file $destination: $!";
        # $serialized has already been encoded to UTF-8 bytes.
        print {$OUTPUT} $serialized
          or die "Couldn't write output file $destination: $!";
        # Buffered write errors (eg. a full disk) only surface at close.
        close($OUTPUT)
          or die "Couldn't close output file $destination: $!";
    }
    else {
        print $serialized;
    }
}

# ========================================
# Now, unbind stuff, so we can clear memory
$latexml    = undef;
$digested   = undef;
$serialized = undef;
CheckDebuggable();
UseSTDERR(undef);

# Non-zero exit code for fatal conversions
exit(1) if $exit_message;
#======================================================================
# Helper code.
# This validates arguments as much as possible BEFORE invoking any LaTeXML components
# (and WITHOUT invoking any LaTeXML Error handling)
# Validate the command-line arguments as far as possible BEFORE invoking any
# LaTeXML components (and WITHOUT invoking any LaTeXML error handling).
#
# Works on the file-level option variables and updates them in place:
#   @paths        reversed and passed through pathname_canonical;
#   $source       '-' is replaced by "literal:" followed by all of standard input;
#   $mode         defaults to 'BibTeX' for a *.bib file, otherwise to 'TeX'
#                 (left untouched for literal data given on the command line);
#   $logfile      defaults to <name>.latexml.log in the current directory,
#                 or to latexml.log when no source file name is available;
#   $destination  passed through pathname_canonical; its directory is created.
#
# Returns the list of problem descriptions; an empty list means success.
sub validate_args {
    my @fails = ();
    push(@fails, "Unrecognized trailing arguments " . join(',', @ARGV)) if @ARGV;

    # Check search paths
    @paths = map { pathname_canonical($_) } reverse(@paths);
    if (my @baddirs = grep { !-d $_ } @paths) {
        push(@fails, "These search paths do not exist: " . join(', ', @baddirs));
    }

    # Find the requested source
    my $pathname;
    # Test for presence rather than truth, so a file literally named "0" is accepted.
    if (!defined($source) || ($source eq '')) {
        push(@fails, "No input file given");
    }
    elsif ($source eq '-') {
        # Slurp standard input explicitly: the diamond operator would read any
        # stray trailing arguments as files instead.  Empty input gives ''.
        my $content = do { local $/ = undef; <STDIN> };
        $content = '' unless defined $content;
        $source  = "literal:" . $content;
        $mode    = 'TeX' unless defined $mode;    # Or sniff for bibtex data?
    }
    elsif (pathname_is_literaldata($source)) {
        # Literal data needs no file lookup.
    }
    elsif (!($pathname = pathname_find($source, types => ['tex', 'bib', ''], paths => ['.', @paths]))
        || !-r $pathname) {
        push(@fails, "Input file '$source' not readable");
    }
    else {
        $mode = 'BibTeX' if !defined $mode && ($pathname =~ /\.bib$/);
        $mode = 'TeX' unless defined $mode;
    }

    # Always create a .log file for tracking the conversion messages
    # Use $jobname.latexml.log or just latexml.log for logging by default
    if ($pathname && !$logfile) {
        my ($dir, $name, $ext) = pathname_split($pathname);
        if ($name) {
            $logfile = pathname_make(dir => pathname_cwd(), name => $name, type => 'latexml.log');
        }
    }
    $logfile = 'latexml.log' unless $logfile;

    # Check that destination is valid before wasting any time...
    if ($destination) {
        $destination = pathname_canonical($destination);
        if (my $dir = pathname_directory($destination)) {
            if (!(pathname_mkdir($dir) && -w $dir)) {
                push(@fails, "Couldn't create writable destination directory $dir: $!");
            }
        }
    }
    return @fails;
}
#**********************************************************************
__END__
=head1 NAME
C<latexml> - transforms a TeX/LaTeX file into XML.
=head1 SYNOPSIS
latexml [options] I<texfile>
Options:
--destination=file sets destination file (default stdout).
--output=file [obsolete synonym for --destination]
--preload=module requests loading of an optional module;
can be repeated
--preamble=file sets a preamble file which will
effectively be prepended to the main file.
--postamble=file sets a postamble file which will
effectively be appended to the main file.
--includestyles allows latexml to load raw *.sty file;
by default it avoids this.
--path=dir adds to the paths searched for files,
modules, etc;
--log=file specifies log file (default is file named after job name)
--documentid=id assign an id to the document root.
--quiet suppress messages (can repeat)
--verbose more informative output (can repeat)
--strict makes latexml less forgiving of errors
--bibtex processes as a BibTeX bibliography.
--xml requests xml output (default).
--tex requests TeX output after expansion.
--box requests box output after expansion
and digestion.
--noparse suppresses parsing math
--nocomments omit comments from the output
--inputencoding=enc specify the input encoding.
--VERSION show version number.
--debug=package enables debugging output for the named
package
--help shows this help message.
If I<texfile> is '-', latexml reads the TeX source from standard input.
If I<texfile> has an explicit extension of C<.bib>, it is processed
as a BibTeX bibliography.
=head1 OPTIONS AND ARGUMENTS
=over 4
=item C<--destination>=I<file>
Specifies the destination file; by default the XML is written to stdout.
=item C<--preload>=I<module>
Requests the loading of an optional module or package. This may be useful if the TeX code
does not specifically require the module (eg. through input or usepackage).
For example, use C<--preload=LaTeX.pool> to force LaTeX mode.
=item C<--preamble>=I<file>, C<--postamble>=I<file>
Specifies a file whose contents will effectively be prepended or appended
to the main document file's content. This can be useful when processing
TeX fragments, in which case the preamble would contain documentclass and begindocument
control sequences. This option is not used when processing BibTeX files.
=item C<--includestyles>
This option allows processing of style files (files with extensions C<sty>,
C<cls>, C<clo>, C<cnf>). By default, these files are ignored unless a latexml
implementation of them is found (with an extension of C<ltxml>).
These style files generally fall into two classes: Those
that merely affect document style are ignorable in the XML.
Others define new markup and document structure, often using
deeper LaTeX macros to achieve their ends. Although the omission
will lead to other errors (missing macro definitions), it is
unlikely that processing the TeX code in the style file will
lead to a correct document.
=item C<--path>=I<dir>
Add I<dir> to the search paths used when searching for files, modules, style files, etc;
somewhat like TEXINPUTS. This option can be repeated.
=item C<--documentid>=I<id>
Assigns an ID to the root element of the XML document. This ID is generally
inherited as the prefix of ID's on all other elements within the document.
This is useful when constructing a site of multiple documents so that
all nodes have unique IDs.
=item C<--quiet>
Reduces the verbosity of output during processing; used twice, it is pretty silent.
=item C<--verbose>
Increases the verbosity of output during processing; used twice, it is pretty chatty.
Can be useful for getting more details when errors occur.
=item C<--strict>
Specifies a strict processing mode. By default, undefined control sequences and
invalid document constructs (that violate the DTD) give warning messages, but attempt
to continue processing. Using --strict makes them generate fatal errors.
=item C<--bibtex>
Forces latexml to treat the file as a BibTeX bibliography.
Note that the timing is slightly different than the usual
case with BibTeX and LaTeX. In the latter case, BibTeX simply
selects and formats a subset of the bibliographic entries; the
actual TeX expansion is carried out when the result is included
in a LaTeX document. In contrast, latexml processes and expands
the entire bibliography; the selection of entries is done
during postprocessing. This also means that any packages
that define macros used in the bibliography must be
specified using the C<--preload> option.
=item C<--xml>
Requests XML output; this is the default.
=item C<--tex>
Requests TeX output for debugging purposes; processing is only carried out through expansion and digestion.
This may not be quite valid TeX, since Unicode may be introduced.
=item C<--box>
Requests Box output for debugging purposes; processing is carried out through expansion and digestion,
and the result is printed.
=item C<--nocomments>
Normally latexml preserves comments from the source file, and adds a comment every 25 lines as
an aid in tracking the source. The option --nocomments discards such comments.
=item C<--inputencoding=>I<encoding>
Specify the input encoding, eg. C<--inputencoding=iso-8859-1>.
The encoding must be one known to Perl's Encode package.
Note that this only enables the translation of the input bytes to
UTF-8 used internally by LaTeXML, but does not affect catcodes.
It is usually better to use LaTeX's inputenc package.
Note that this does not affect the output encoding, which is
always UTF-8.
=item C<--VERSION>
Shows the version number of the LaTeXML package.
=item C<--debug>=I<package>
Enables debugging output for the named package. The package is given without the leading LaTeXML::.
=item C<--help>
Shows this help message.
=back
=head1 SEE ALSO
L<latexmlpost>, L<latexmlmath>, L<LaTeXML>
=cut