forked from wingjay/SJTUThesis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
texcount.pl
executable file
·3809 lines (3458 loc) · 127 KB
/
texcount.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#! /usr/bin/env perl
#TeXcount is a Perl script that counts the number of words in the
#text of LaTeX files. It has rules for handling most of the common
#macros and provides colour coded output indicating which parts have
#been counted. Go to
# http://app.uio.no/ifi/texcount/
#for more information or to access the script online as a web service.
#
#The package, i.e. the script and all accompanying files, is
#distributed under the LaTeX Project Public License.
use strict;
use warnings;
use utf8; # Because the script itself is UTF-8 encoded
use Encode;
use Text::Wrap;
use Term::ANSIColor;
# Compile-time setup: when the OS name ($^O) starts with "MSWin", load
# Win32::Console::ANSI and call its import method.
# NOTE(review): presumably this is what makes the Windows console honour the
# ANSI colour codes produced via Term::ANSIColor -- confirm in the module docs.
BEGIN {
if ($^O=~/^MSWin/) {
require Win32::Console::ANSI;
Win32::Console::ANSI::->import();
}
}
##### Version information
# Version number and release date of this script.
my ($versionnumber,$versiondate)=('3.0','2013 Jul 29');
###### Set global settings and variables
### Global data about TeXcount
# Descriptive facts about the script, looked up by name.
my %GLOBALDATA;
$GLOBALDATA{'versionnumber'}=$versionnumber;
$GLOBALDATA{'versiondate'}=$versiondate;
$GLOBALDATA{'maintainer'}='Einar Andreas Rodland';
$GLOBALDATA{'copyrightyears'}='2008-2013';
$GLOBALDATA{'website'}='http://app.uio.no/ifi/texcount/';
### Options and states
# File-scoped option flags, working state and constants. All of them are
# lexicals declared at the top level of the script.
# Outer object (for error reports not added by a TeX object)
# NOTE(review): getMain() is defined outside this section of the file.
my $Main=getMain();
# Global options and settings
my $htmlstyle=0; # Flag to print HTML
my $texcodeoutput=0; # Flag to convert output to valid TeX text
my $encoding=undef; # Selected input encoding (by default the encoding is guessed)
my @encodingGuessOrder=qw/ascii utf8 latin1/; # Encoding guessing order
my $outputEncoding; # Encoding used for output
my @AlphabetScripts=qw/Digit Is_alphabetic/; # Letters minus logograms: defined later
my @LogogramScripts=qw/Ideographic Katakana Hiragana Thai Lao Hangul/; # Scripts counted as whole words
# Parsing rules options
my $includeTeX=0; # Flag to parse included files
my $includeBibliography=0; # Flag to include bibliography
my %substitutions; # Substitutions to make globally
my %IncludedPackages; # List of included packages
# Counting options
my @sumweights; # Set count weights for computing sum
my $optionWordFreq=0; # Count words of this frequency, or don't count if 0
my $optionWordClassFreq=0; # Count words per word class (language) if set
my $optionMacroStat=0; # Count macro, environment and package usage
# Parsing details options
my $strictness=0; # Flag to check for undefined environments
my $defaultVerbosity='0'; # Specification of default verbose output style (used as a key into %STYLES)
my $defaultprintlevel=0; # Flag indicating default level of verbose output
my $printlevel=undef; # Flag indicating level of verbose output
my $showstates=0; # Flag to show internal state in verbose log
my $showcodes=1; # Flag to show overview of colour codes (2=force)
my $showsubcounts=0; # Write subcounts if #>this, or not (if 0)
my $separatorstyleregex='^word'; # Styles (regex) after which separator should be added
my $separator=''; # Separator to add after words/tokens
# Final summary output options
my $showVersion=0; # Indicator that version info be included (1) or not (-1)
my $totalflag=0; # Flag to write only total summary
my $briefsum=0; # Flag to set brief summary
my $outputtemplate; # Output template
my $finalLineBreak=1; # Add line break at end
# Global settings
my $optionFast=1; # Flag indicating fast method
# Global variables and internal states (for internal use only)
my $blankline=0; # Number of blank lines printed
my $errorcount=0; # Number of errors in parsing
my %warnings=(); # Warnings
my %WordFreq; # Hash for counting words
my %MacroUsage; # Hash for counting macros, environments and packages
# External sources
my $HTMLfile; # HTML file to use as HTML output template
my $CSSfile; # CSS file to use with HTML output
my $CSShref; # CSS reference to use with HTML output
my @htmlhead; # Lines to add to the HTML header
my $htmlopen; # text used to open the HTML file
my $htmlclose; # text used to close the HTML file
# String data storage
my $STRINGDATA;
# Other constants
my $_PARAM_='<param>'; # to identify parameter to _parse_unit
my $STRING_PARAMETER='{_}'; # used to log macro parameters
my $STRING_OPTIONAL_PARAM='[_]'; # used to log optional parameter
my $STRING_GOBBLED_OPTION='[]'; # used to log gobbled macro option
my $STRING_ERROR='<error>'; # used to log errors causing parsing to stop
my $REGEX_NUMBER=qr/^\d+$/; # regex to recognize a number
###### Set CMD specific settings and variables
## Preset command line options
# List of options (strings) separated by comma,
# e.g. ('-inc','-v') to parse included files and
# give verbose output by default.
my @StartupOptions=();
# CMD specific global variables
my @filelist; # List of files to parse
my $globalworkdir='./'; # Overrules workdir (default=present root)
my $workdir; # Current directory (default=taken from filename)
my $auxdir; # Directory for auxiliary files, e.g. bbl (default=$workdir)
my $fileFromSTDIN=0; # Flag to set input from STDIN
my $_STDIN_='<STDIN>'; # File name to represent STDIN (must be '<...>'!)
# CMD specific settings
$Text::Wrap::columns=76; # Page width for wrapped output
###### Set state identifiers and methods
### Counter indices from 0 to $SIZE_CNT-1
# 0: Number of files
# 1: Text words
# 2: Header words
# 3: Words outside text (captions, etc.)
# 4: Number of headers
# 5: Number of floating environments
# 6: Number of inlined math
# 7: Number of displayed math
# $SIZE_CNT is the current number of counters; add_new_counter increments it.
my $SIZE_CNT=8;
my $SIZE_CNT_DEFAULT=8;
my $CNT_FILE=0;
my $CNT_WORDS_TEXT=1;
my $CNT_WORDS_HEADER=2;
my $CNT_WORDS_OTHER=3;
my $CNT_COUNT_HEADER=4;
my $CNT_COUNT_FLOAT=5;
my $CNT_COUNT_INLINEMATH=6;
my $CNT_COUNT_DISPLAYMATH=7;
# Labels used to describe the counts
# (both arrays are indexed by the counter indices above)
my @countkey=('file','word','hword','oword','header','float','inmath','dsmath');
my @countdesc=('Files','Words in text','Words in headers',
'Words outside text (captions, etc.)','Number of headers','Number of floats/tables/figures',
'Number of math inlines','Number of math displayed');
# Map keywords to counters
# NOTE(review): add_keys_to_hash is defined outside this section; from its use
# here it appears to store the second argument under every following key
# (the numeric index doubling as a key) -- confirm against its definition.
my %key2cnt;
add_keys_to_hash(\%key2cnt,$CNT_FILE,0,'file');
add_keys_to_hash(\%key2cnt,$CNT_WORDS_TEXT,1,'text','word','w','wd');
add_keys_to_hash(\%key2cnt,$CNT_WORDS_HEADER,2,'headerword','hword','hw','hwd');
add_keys_to_hash(\%key2cnt,$CNT_WORDS_OTHER,3,'otherword','other','oword','ow','owd');
add_keys_to_hash(\%key2cnt,$CNT_COUNT_HEADER,4,'header','heading','head');
add_keys_to_hash(\%key2cnt,$CNT_COUNT_FLOAT,5,'float','table','figure');
add_keys_to_hash(\%key2cnt,$CNT_COUNT_INLINEMATH,6,'inline','inlinemath','imath','eq');
add_keys_to_hash(\%key2cnt,$CNT_COUNT_DISPLAYMATH,7,'displaymath','dsmath','dmath','ds');
### Token types
# Set in $tex->{'type'} by call to _next_token
# Numeric codes identifying the kind of token.
my $TOKEN_SPACE=-1;
my $TOKEN_COMMENT=0;
my $TOKEN_WORD=1; # word (or other form of text or text component)
my $TOKEN_SYMBOL=2; # symbol (not word, e.g. punctuation)
my $TOKEN_MACRO=3; # macro (\name)
my $TOKEN_BRACE=4; # curly braces: { }
my $TOKEN_BRACKET=5; # brackets: [ ]
my $TOKEN_MATH=6; # NOTE(review): presumably math delimiters such as $ -- confirm in the tokenizer
my $TOKEN_LINEBREAK=9; # line break in file
my $TOKEN_TC=666; # TeXcount instructions (%TC:instr)
my $TOKEN_END=999; # end of line or blank line
### Parsing states
#
## States for regions that should not be counted
# IGNORE = exclude from count
# MATH = math/equation contents
# FLOAT = float (exclude, but include captions)
# EXCLUDE_STRONG = strong exclude, ignore environments
# EXCLUDE_STRONGER = stronger exclude, do not parse macro parameters
# EXCLUDE_ALL = ignore everything except end marker: even {
# PREAMBLE = preamble (between \documentclass and \begin{document})
## States for regions in which words should be counted
# TEXT = text
# TEXT_HEADER = header text
# TEXT_FLOAT = float text
## State change: not used in parsing, but to switch state then ignore contents
# TO_HEADER = count header, then count words as header words
# TO_FLOAT = count float, then count words as float/other words
# TO_INLINEMATH = switch to inlined math
# TO_DISPLAYMATH = switch to displayed math
## Other states
# _NULL = default state to use if none other is defined
# _OPTION = state used to indicate that the next parameter is an option
# _EXCLUDE_ = cutoff, state <= this represents excluded text
## NB: Presently, it is assumed that additional states is added as 8,9,...,
## e.g. that states added through TC:newcounter correspond to the added counters.
#
my $STATE_IGNORE=-1;
my $STATE_MATH=-2;
my $STATE_FLOAT=-10;
my $STATE_EXCLUDE_STRONG=-20;
my $STATE_EXCLUDE_STRONGER=-30;
my $STATE_EXCLUDE_ALL=-40;
my $STATE_PREAMBLE=-99;
my $STATE_TEXT=1;
my $STATE_TEXT_HEADER=2;
my $STATE_TEXT_FLOAT=3;
my $STATE_TO_HEADER=4;
my $STATE_TO_FLOAT=5;
my $STATE_TO_INLINEMATH=6;
my $STATE_TO_DISPLAYMATH=7;
my $__STATE_EXCLUDE_=-10; # same value as $STATE_FLOAT: states <= this count as excluded
my $__STATE_NULL=1; # same value as $STATE_TEXT
my $_STATE_OPTION=-1000;
my $_STATE_NOOPTION=-1001;
my $_STATE_AUTOOPTION=-1002;
# Counter key mapped to STATE
my $PREFIX_PARAM_OPTION=' '; # Prefix for parameter options/modifiers
# Keyword (as used in rule definitions) to state value.
# NOTE(review): add_keys_to_hash is defined outside this section; from its use
# here it appears to store the second argument under every following key --
# confirm against its definition.
my %key2state;
add_keys_to_hash(\%key2state,$STATE_TEXT,1,'text','word','w','wd');
add_keys_to_hash(\%key2state,$STATE_TEXT_HEADER,2,'headertext','headerword','hword','hw','hwd');
add_keys_to_hash(\%key2state,$STATE_TEXT_FLOAT,3,'otherword','other','oword','ow','owd');
add_keys_to_hash(\%key2state,$STATE_TO_HEADER,4,'header','heading','head');
add_keys_to_hash(\%key2state,$STATE_TO_FLOAT,5,'float','table','figure');
add_keys_to_hash(\%key2state,$STATE_TO_INLINEMATH,6,'inline','inlinemath','imath','eq');
add_keys_to_hash(\%key2state,$STATE_TO_DISPLAYMATH,7,'displaymath','dsmath','dmath','ds');
add_keys_to_hash(\%key2state,$STATE_IGNORE,0,'ignore','x');
add_keys_to_hash(\%key2state,$STATE_MATH,'ismath');
add_keys_to_hash(\%key2state,$STATE_FLOAT,-1,'isfloat');
add_keys_to_hash(\%key2state,$STATE_EXCLUDE_STRONG,-2,'xx');
add_keys_to_hash(\%key2state,$STATE_EXCLUDE_STRONGER,-3,'xxx');
add_keys_to_hash(\%key2state,$STATE_EXCLUDE_ALL,-4,'xall');
# The option keywords below start with a space, matching $PREFIX_PARAM_OPTION.
add_keys_to_hash(\%key2state,$_STATE_OPTION,'[',' option',' opt',' optional');
add_keys_to_hash(\%key2state,$_STATE_NOOPTION,'nooption','nooptions','noopt','noopts');
add_keys_to_hash(\%key2state,$_STATE_AUTOOPTION,'autooption','autooptions','autoopt','autoopts');
# When combining two states, use the first one; list must be complete!
# The three lists are ordered from highest to lowest priority.
my @STATE_FIRST_PRIORITY=(
$STATE_EXCLUDE_ALL,
$STATE_EXCLUDE_STRONGER,
$STATE_EXCLUDE_STRONG,
$STATE_FLOAT,
$STATE_MATH,
$STATE_IGNORE,
$STATE_PREAMBLE,
$STATE_TO_FLOAT,
$STATE_TO_HEADER,
$STATE_TO_INLINEMATH,
$STATE_TO_DISPLAYMATH);
# Starts empty; add_new_counter pushes the state of each new counter here.
my @STATE_MID_PRIORITY=();
my @STATE_LAST_PRIORITY=(
$STATE_TEXT_FLOAT,
$STATE_TEXT_HEADER,
$STATE_TEXT);
# Map state to corresponding word counter
my %state2cnt=(
$STATE_TEXT => $CNT_WORDS_TEXT,
$STATE_TEXT_HEADER => $CNT_WORDS_HEADER,
$STATE_TEXT_FLOAT => $CNT_WORDS_OTHER);
# Transition state mapped to content state and counter
# (each value is a two-element array: [content state, counter index])
my %transition2state=(
$STATE_TO_HEADER => [$STATE_TEXT_HEADER,$CNT_COUNT_HEADER],
$STATE_TO_INLINEMATH => [$STATE_MATH ,$CNT_COUNT_INLINEMATH],
$STATE_TO_DISPLAYMATH => [$STATE_MATH ,$CNT_COUNT_DISPLAYMATH],
$STATE_TO_FLOAT => [$STATE_FLOAT ,$CNT_COUNT_FLOAT]);
# Parsing state descriptions (used for macro rule help)
my %state2desc=(
$STATE_IGNORE => 'ignore: do not count',
$STATE_MATH => 'math/equation contents',
$STATE_FLOAT => 'float (figure, etc.): ignore all but special macros',
$STATE_EXCLUDE_STRONG => 'strong exclude: ignore environments',
$STATE_EXCLUDE_STRONGER => 'stronger exclude: ignore environments and macro paramters',
$STATE_EXCLUDE_ALL => 'exlude all: even {, only scan for end marker',
$STATE_PREAMBLE => 'preamble: from \documentclass to \begin{document}',
$STATE_TEXT => 'text: count words',
$STATE_TEXT_HEADER => 'header text: count words as header words',
$STATE_TEXT_FLOAT => 'float text: count words as float words (e.g. captions)',
$STATE_TO_HEADER => 'header: count header, then count words as header words',
$STATE_TO_FLOAT => 'float: count float, then count words as float/other words',
$STATE_TO_INLINEMATH => 'inline math: count as inline math/equation',
$STATE_TO_DISPLAYMATH => 'displayed math: count as displayed math/equation');
# Parsing state presentation style
# (values are style class names that also appear as keys in the print styles)
my %state2style=(
$STATE_TEXT => 'word',
$STATE_TEXT_HEADER => 'hword',
$STATE_TEXT_FLOAT => 'oword',
);
# State: true when the state value is at or above $STATE_TEXT, i.e. a region
# whose words are included ("include state" would be the more accurate name).
sub state_is_text {
  my ($state)=@_;
  return $state>=$STATE_TEXT;
}
# State: true for a parsed/included region, which is either a text state
# (value at or above $STATE_TEXT) or the preamble state.
sub state_is_parsed {
  my ($state)=@_;
  my $is_text=($state>=$STATE_TEXT);
  return ($is_text || $state==$STATE_PREAMBLE);
}
# State: look up the counter index registered for a text state in %state2cnt;
# yields undef when the state has no entry.
sub state_text_cnt {
  my ($state)=@_;
  return $state2cnt{$state};
}
# State: true for an exclude state, i.e. a state value at or below the
# $__STATE_EXCLUDE_ cutoff.
sub state_is_exclude {
  my ($state)=@_;
  return $state<=$__STATE_EXCLUDE_;
}
# State: true when \begin and \end should be processed, which holds for
# every state value strictly above $STATE_EXCLUDE_STRONG.
sub state_inc_envir {
  my ($state)=@_;
  return $state>$STATE_EXCLUDE_STRONG;
}
# State as text (used with printstate).
# Currently hands back the state value unchanged.
# TODO: Should do a conversion based on STATE values.
sub state_to_text {
  my ($state)=@_;
  return $state;
}
# Style to use with text state: the entry of %state2style for the given
# state, or undef when none is registered.
sub state_to_style {
  my ($state)=@_;
  return $state2style{$state};
}
# Add new counter with the given key and description.
#   $key  - keyword for the counter (stored lower-cased)
#   $desc - description appended to @countdesc
#   $like - optional index of an existing counter whose sum weight is copied
#           (defaults to $CNT_WORDS_OTHER)
# The new counter and its matching state both get the current $SIZE_CNT as
# their number, after which $SIZE_CNT is incremented.
sub add_new_counter {
  my ($key,$desc,$like)=@_;
  my $index=$SIZE_CNT; # serves as both counter index and state value
  $key=lc($key);
  $like=$CNT_WORDS_OTHER unless defined $like;
  # Register the counter itself
  $key2cnt{$key}=$index;
  push @countkey,$key;
  push @countdesc,$desc;
  $sumweights[$index]=$sumweights[$like] if defined $sumweights[$like];
  # Register the state that counts into it
  $key2state{$key}=$index;
  $state2cnt{$index}=$index;
  $state2style{$index}='altwd';
  push @STATE_MID_PRIORITY,$index;
  $SIZE_CNT++;
}
###### Set global definitions
### Break points
# Definition of macros that define break points that start a new subcount.
# The values given are used as labels.
# Each named level contains the macros of the level before it plus its own
# sectioning macro; 'default' shares the hash of the 'subsection' level.
my %BreakPointsOptions=('none'=>{});
{
  my $previous='none';
  for my $level (['part','Part'],['chapter','Chapter'],['section','Section'],['subsection','Subsection']) {
    my ($name,$label)=@{$level};
    $BreakPointsOptions{$name}={%{$BreakPointsOptions{$previous}},'\\'.$name=>$label};
    $previous=$name;
  }
}
$BreakPointsOptions{'default'}=$BreakPointsOptions{'subsection'};
# Active break points: start out with none
my %BreakPoints=%{$BreakPointsOptions{'none'}};
### Print styles
# Definition of different print styles: maps of class labels
# to ANSI codes. Class labels are as used by HTML styles.
# The named groups ('Errors', 'Words', ...) are building blocks; the numbered
# styles 0-4 are assembled from them, each of 2-4 copying the one before it.
my %STYLES;
my $STYLE_EMPTY=' ';
my $STYLE_BLOCK='-';
my $NOSTYLE=' ';
$STYLES{'Errors'}={'error'=>'bold red'};
$STYLES{'Words'}={'word'=>'blue','hword'=>'bold blue','oword'=>'blue','altwd'=>'blue'};
$STYLES{'Macros'}={'cmd'=>'green','fileinc'=>'bold green'};
$STYLES{'Options'}={'option'=>'yellow','optparm'=>'green'};
$STYLES{'Ignored'}={'ignore'=>'cyan','math'=>'magenta'};
$STYLES{'Excluded'}={'exclcmd'=>'yellow','exclenv'=>'yellow','exclmath'=>'yellow','mathcmd'=>'yellow'};
$STYLES{'Groups'}={'document'=>'red','envir'=>'red','mathgroup'=>'magenta'};
$STYLES{'Comments'}={'tc'=>'bold yellow','comment'=>'yellow'};
$STYLES{'Sums'}={'cumsum'=>'yellow'};
$STYLES{'States'}={'state'=>'cyan underline'};
# '<core>' carries '<printlevel>' 1, style 0 sets it to 0 and style 3 raises
# it to 2; styles 1-2 inherit 1 from '<core>' and style 4 inherits 2 from 3.
$STYLES{'<core>'}={%{$STYLES{'Errors'}},$STYLE_EMPTY=>$NOSTYLE,'<printlevel>'=>1};
$STYLES{0}={%{$STYLES{'Errors'}},'<printlevel>'=>0};
$STYLES{1}={%{$STYLES{'<core>'}},%{$STYLES{'Words'}},%{$STYLES{'Groups'}},%{$STYLES{'Sums'}}};
$STYLES{2}={%{$STYLES{1}},%{$STYLES{'Macros'}},%{$STYLES{'Ignored'}},%{$STYLES{'Excluded'}}};
$STYLES{3}={%{$STYLES{2}},%{$STYLES{'Options'}},%{$STYLES{'Comments'}},'<printlevel>'=>2};
$STYLES{4}={%{$STYLES{3}},%{$STYLES{'States'}}};
$STYLES{'All'}=$STYLES{4};
# Active style: a copy of the style selected by $defaultVerbosity
my %STYLE=%{$STYLES{$defaultVerbosity}};
# Style class labels in a fixed order, followed by a description of each
my @STYLE_LIST=('error','word','hword','oword','altwd',
'ignore','document','cmd','exclcmd','option','optparm','envir','exclenv',
'mathgroup','exclmath','math','mathcmd','comment','tc','fileinc','state','cumsum');
my %STYLE_DESC=(
'error' => 'ERROR: TeXcount error message',
'word' => 'Text which is counted: counted as text words',
'hword' => 'Header and title text: counted as header words',
'oword' => 'Caption text and footnotes: counted as caption words',
'altwd' => 'Words in user specified counters: counted in separate counters',
'ignore' => 'Ignored text or code: excluded or ignored',
'document' => '\documentclass: document start, beginning of preamble',
'cmd' => '\macro: macro not counted, but parameters may be',
'exclcmd' => '\macro: macro in excluded region',
'option' => '[Macro options]: not counted',
'optparm' => '[Optional parameter]: content parsed and styled as counted',
'envir' => '\begin{name} \end{name}: environment',
'exclenv' => '\begin{name} \end{name}: environment in excluded region',
'mathgroup' => '$ $: counted as one equation',
'exclmath' => '$ $: equation in excluded region',
'math' => '2+2=4: maths (inside $...$ etc.)',
'mathcmd' => '$\macro$: macros inside maths',
'comment' => '% Comments: not counted',
'tc' => '%TC:TeXcount instructions: not counted',
'fileinc' => 'File to include: not counted but file may be counted later',
'state' => '[state]: internal TeXcount state',
'cumsum' => '[cumsum]: cumulative sum count');
###### Define what a word is and language options
# Patterns matching a letter. Should be a single character or
# ()-enclosed regex for substitution into word pattern regex.
#
# @LetterMacros lists macro names (without the backslash) that stand for a
# single letter. The Greek entry is 'lambda': the earlier spelling 'lamda'
# never matched the LaTeX macro \lambda.
my @LetterMacros=qw/ae AE o O aa AA oe OE ss
alpha beta gamma delta epsilon zeta eta theta iota kappa lambda
mu nu xi pi rho sigma tau upsilon phi chi psi omega
Gamma Delta Theta Lambda Xi Pi Sigma Upsilon Phi Psi Omega
/;
# Regex source: backslash, one of the letter macro names, then {} or
# whitespace or a word boundary.
my $specialchars='\\\\('.join('|',@LetterMacros).')(\{\}|\s*|\b)';
# Regex source: backslash, an accent character, then a letter placeholder
# (@), optionally wrapped in braces.
my $modifiedchars='\\\\[\'\"\`\~\^\=](@|\{@\})';
# Named letter patterns; 'relaxed' is currently the same as 'default'.
my %NamedLetterPattern;
$NamedLetterPattern{'restricted'}='@';
$NamedLetterPattern{'default'}='('.join('|','@',$modifiedchars,$specialchars).')';
$NamedLetterPattern{'relaxed'}=$NamedLetterPattern{'default'};
my $LetterPattern=$NamedLetterPattern{'default'};
# List of regexp patterns that should be analysed as words.
# Use @ to represent a letter, will be substituted with $LetterPattern.
# Named patterns may replace or be appended to the original patterns.
# Apply_Options() results in a call to apply_language_options() which
# constructs $WordPattern based on $LetterPattern, @WordPatterns and
# alphabet/logogram settings.
my %NamedWordPattern;
$NamedWordPattern{'letters'}='@';
# 'words': a run of letters (optionally with one {...} group of letters before
# or after it), followed by any number of further letter runs or {...} groups,
# each optionally preceded by one of - ' .
$NamedWordPattern{'words'}='(@+|@+\{@+\}|\{@+\}@+)([\-\'\.]?(@+|\{@+\}))*';
my @WordPatterns=($NamedWordPattern{'words'});
my $WordPattern; # Regex matching a word (defined in apply_language_options())
### Macro option regexp list
# List of regexp patterns to be gobbled as macro option in and after
# a macro.
#   default    - [...] on one line containing no brackets
#   relaxed    - as written, [...] with exactly one line break inside it
#                NOTE(review): the continuation group has no quantifier, so a
#                single-line option does not match -- confirm this is intended
#   restricted - [...] containing only word characters, whitespace and , - ~ . : ; + ? * _ =
my %NamedMacroOptionPattern;
$NamedMacroOptionPattern{'default'}='\[[^\[\]\n]*\]';
$NamedMacroOptionPattern{'relaxed'}='\[[^\[\]\n]*(\n[^\[\]\n]+)\n?\]';
$NamedMacroOptionPattern{'restricted'}='\[(\w|[,\-\s\~\.\:\;\+\?\*\_\=])*\]';
my $MacroOptionPattern=$NamedMacroOptionPattern{'default'};
### Alternative language encodings
# Encoding guess orders per language name.
# NOTE(review): presumably these replace @encodingGuessOrder when the language
# is selected -- confirm where %NamedEncodingGuessOrder is read.
my %NamedEncodingGuessOrder;
$NamedEncodingGuessOrder{'chinese'}=[qw/utf8 gb2312 big5/];
$NamedEncodingGuessOrder{'japanese'}=[qw/utf8 euc-jp iso-2022-jp jis shiftjis/];
$NamedEncodingGuessOrder{'korean'}=[qw/utf8 euc-kr iso-2022-kr/];
###### Define character classes (alphabets)
### Character classes to use as Unicode properties
# Character group representing digits 0-9 (more restrictive than Digits):
# a single code point range, U+0030 to U+0039, in the tab-separated
# hexadecimal form used for user-defined Unicode properties.
sub Is_digit {
  return "0030\t0039\n";
}
# Character group representing letters (excluding logograms): the Alphabetic
# property with the Ideographic, Katakana, Hiragana, Thai, Lao and Hangul
# properties removed. One property per line, '+' to include, '-' to exclude.
sub Is_alphabetic {
  return join("\n",
    '+utf8::Alphabetic',
    '-utf8::Ideographic',
    '-utf8::Katakana',
    '-utf8::Hiragana',
    '-utf8::Thai',
    '-utf8::Lao',
    '-utf8::Hangul',
    ''); # the empty last element terminates the final line
}
# Character group representing letters and digits (excluding logograms): the
# Alphabetic and Digit properties with the Ideographic, Katakana, Hiragana,
# Thai, Lao and Hangul properties removed.
sub Is_alphanumeric {
  return join("\n",
    '+utf8::Alphabetic',
    '+utf8::Digit',
    '-utf8::Ideographic',
    '-utf8::Katakana',
    '-utf8::Hiragana',
    '-utf8::Thai',
    '-utf8::Lao',
    '-utf8::Hangul',
    ''); # the empty last element terminates the final line
}
# Character class for punctuation excluding special characters: the
# Punctuation property minus U+0024-U+0025 ($ %), U+005C (backslash) and
# U+007B-U+007E ({ | } ~).
# The two ends of a range must be separated by a tab. The last line used to
# read "-007b\007e", which the interpolating here-document turned into
# "-007b", a BEL character (octal escape \007) and "e", so the range was
# malformed.
sub Is_punctuation { return <<END;
+utf8::Punctuation
-0024\t0025
-005c
-007b\t007e
END
}
# Character group representing CJK characters: the union of the Han,
# Katakana, Hiragana and Hangul properties.
sub Is_cjk {
  return join("\n",
    '+utf8::Han',
    '+utf8::Katakana',
    '+utf8::Hiragana',
    '+utf8::Hangul',
    ''); # the empty last element terminates the final line
}
# Character group for CJK punctuation characters, given as a list of
# hexadecimal code point ranges (first and last code point of each range).
sub Is_cjkpunctuation {
  my @ranges=(
    ['3000','303f'],
    ['2018','201f'],
    ['ff01','ff0f'],
    ['ff1a','ff1f'],
    ['ff3b','ff3f'],
    ['ff5b','ff65'],
  );
  # One line per range, with the two ends separated by a tab
  return join('',map { join("\t",@{$_})."\n" } @ranges);
}
###### Define core rules
### Macros indicating package inclusion
# Will always be assumed to take one parameter (plus options).
# Gets added to TeXmacro. After that, values are not used, only membership.
my %TeXpackageinc=('\usepackage'=>1,'\RequirePackage'=>1);
### Macros that are counted within the preamble
# The preamble is the text between \documentclass and \begin{document}.
# Text and macros in the preamble is ignored unless specified here. The
# value is the states (1=text, 2=header, etc.) they should be interpreted as.
# Note that only the first unit (token or {...} block) is counted.
# Gets added to TeXmacro. Is used within preambles only.
# NOTE(review): add_keys_to_hash is defined outside this section; from its use
# here it appears to store the rule (second argument) under each macro name
# that follows -- confirm against its definition.
my %TeXpreamble;
add_keys_to_hash(\%TeXpreamble,['header'],'\title');
add_keys_to_hash(\%TeXpreamble,['other'],'\thanks');
add_keys_to_hash(\%TeXpreamble,['xxx','xxx'],'\newcommand','\renewcommand');
add_keys_to_hash(\%TeXpreamble,['xxx','xxx','xxx'],'\newenvironment','\renewenvironment');
### In floats: include only specific macros
# Macros used to identify caption text within floats.
# Gets added to TeXmacro. Is used within floats only.
my %TeXfloatinc=('\caption'=>['otherword']);
### How many tokens to gobble after macro
# Each macro is assumed to gobble up a given number of
# tokens (or {...} groups), as well as options [...] before, within
# and after. The %TeXmacro hash gives a link from a macro
# (or beginNAME for environment with no the backslash)
# to either an integer giving the number of tokens to ignore
# or to an array (specified as [rule,rule,...]) of length N where
# N is the number of parameters to be read with the macro. The
# array values tell how each is to be interpreted (see the parser state
# keywords for valid values). Thus specifying a number N is
# equivalent to specifying an array of N 'ignore' rules.
#
# For macros not specified here, the default value is 0: i.e.
# no tokens are excluded, but [...] options are.
#
# The hash starts as a copy of the preamble, float and package inclusion rules.
# NOTE(review): some macros are listed more than once below (\multicolumn,
# \title, \newcommand, \newenvironment, ...); presumably the later call wins,
# as add_keys_to_hash looks like a plain hash assignment -- confirm.
my %TeXmacro=(%TeXpreamble,%TeXfloatinc,%TeXpackageinc);
add_keys_to_hash(\%TeXmacro,['text'],
'\textnormal','\textrm','\textit','\textbf','\textsf','\texttt','\textsc','\textsl','\textup','\textmd',
'\makebox','\mbox','\framebox','\fbox','\uppercase','\lowercase','\textsuperscript','\textsubscript',
'\citetext');
add_keys_to_hash(\%TeXmacro,['[','text'],
'\item');
add_keys_to_hash(\%TeXmacro,['[','ignore'],
'\linebreak','\nolinebreak','\pagebreak','\nopagebreak');
add_keys_to_hash(\%TeXmacro,0,
'\maketitle','\indent','\noindent',
'\centering','\raggedright','\raggedleft','\clearpage','\cleardoublepage','\newline','\newpage',
'\smallskip','\medskip','\bigskip','\vfill','\hfill','\hrulefill','\dotfill',
'\normalsize','\small','\footnotesize','\scriptsize','\tiny','\large','\Large','\LARGE','\huge','\Huge',
'\normalfont','\em','\rm','\it','\bf','\sf','\tt','\sc','\sl',
'\rmfamily','\sffamily','\ttfamily','\upshape','\itshape','\slshape','\scshape','\mdseries','\bfseries',
'\selectfont',
'\tableofcontents','\listoftables','\listoffigures');
add_keys_to_hash(\%TeXmacro,1,
'\begin','\end',
'\documentclass','\documentstyle','\hyphenation','\pagestyle','\thispagestyle',
'\author','\date',
'\bibliographystyle','\bibliography','\pagenumbering','\markright',
'\includeonly','\includegraphics','\special',
'\label','\ref','\pageref','\bibitem',
'\eqlabel','\eqref','\hspace','\vspace','\addvspace',
'\newsavebox','\usebox',
'\newlength','\newcounter','\stepcounter','\refstepcounter','\usecounter',
'\fontfamily','\fontseries',
'\alph','\arabic','\fnsymbol','\roman','\value',
'\typeout', '\typein','\cline');
add_keys_to_hash(\%TeXmacro,2,
'\newfont','\newtheorem','\sbox','\savebox','\rule','\markboth',
'\setlength','\addtolength','\settodepth','\settoheight','\settowidth','\setcounter',
'\addtocontents','\addtocounter',
'\fontsize');
add_keys_to_hash(\%TeXmacro,3,'\multicolumn','\addcontentsline');
add_keys_to_hash(\%TeXmacro,6,'\DeclareFontShape');
add_keys_to_hash(\%TeXmacro,['[','text','ignore'],
'\cite','\nocite','\citep','\citet','\citeauthor','\citeyear','\citeyearpar',
'\citealp','\citealt','\Citep','\Citet','\Citealp','\Citealt','\Citeauthor');
add_keys_to_hash(\%TeXmacro,['ignore','text'],'\parbox','\raisebox');
add_keys_to_hash(\%TeXmacro,['otherword'],'\marginpar','\footnote','\footnotetext');
add_keys_to_hash(\%TeXmacro,['header'],
'\title','\part','\chapter','\section','\subsection','\subsubsection','\paragraph','\subparagraph');
# \multicolumn is given a second rule here (it is also listed with value 3 above)
add_keys_to_hash(\%TeXmacro,['xxx','xxx','text'],'\multicolumn');
add_keys_to_hash(\%TeXmacro,['xxx','xxx'],'\newcommand','\renewcommand');
add_keys_to_hash(\%TeXmacro,['xxx','xxx','xxx'],'\newenvironment','\renewenvironment');
### Environments
# The %TeXenvir hash provides content parsing rules (parser states).
# Environments that are not defined will be counted as the surrounding text.
#
# Parameters taken by the \begin{environment} are defined in %TeXmacro.
#
# Note that some environments may only exist within math-mode, and
# therefore need not be defined here: in fact, they should not as it
# is not clear if they will be in inlined or displayed math.
#
# The rule given for each group of environments is a parser state keyword.
my %TeXenvir;
add_keys_to_hash(\%TeXenvir,'ignore',
'titlepage','tabbing','tabular','tabular*','thebibliography','lrbox');
add_keys_to_hash(\%TeXenvir,'text',
'document','letter','center','flushleft','flushright',
'abstract','quote','quotation','verse','minipage',
'description','enumerate','itemize','list',
'theorem','thm','lemma','definition','corollary','example','proof','pf');
add_keys_to_hash(\%TeXenvir,'inlinemath',
'math');
add_keys_to_hash(\%TeXenvir,'displaymath',
'displaymath','equation','equation*','eqnarray','eqnarray*','align','align*',);
add_keys_to_hash(\%TeXenvir,'float',
'float','picture','figure','figure*','table','table*');
add_keys_to_hash(\%TeXenvir,'xall',
'verbatim','tikzpicture');
# Environment parameters
# Rules for the parameters following \begin{name} are stored in %TeXmacro
# under the key formed by $PREFIX_ENVIR followed by the environment name.
my $PREFIX_ENVIR='begin'; # Prefix used for environment names
add_keys_to_hash(\%TeXmacro,1,
'beginthebibliography','beginlrbox','beginminipage');
add_keys_to_hash(\%TeXmacro,2,
'beginlist');
add_keys_to_hash(\%TeXmacro,['ignore'],
'beginletter');
add_keys_to_hash(\%TeXmacro,['xxx'],
'begintabular');
add_keys_to_hash(\%TeXmacro,['ignore','xxx'],
'begintabular*');
add_keys_to_hash(\%TeXmacro,['[','text'],
'begintheorem','beginthm','beginlemma','begindefinition','begincorollary','beginexample','beginproof','beginpf');
add_keys_to_hash(\%TeXmacro,['nooptions'],
'beginverbatim');
### Macros that should be counted as one or more words
# Macros that represent text may be declared here. The value gives
# the number of words the macro represents.
# A value may also be an array of counter keywords, as for 'beginabstract'.
my %TeXmacrocount=('\LaTeX'=>1,'\TeX'=>1,'beginabstract'=>['header','headerword']);
### Macros for including tex files
# Allows \macro{file} or \macro file. If the value is 0, the filename will
# be used as is; if it is 1, the filetype .tex will be added if the
# filename is without filetype; if it is 2, the filetype .tex will be added.
# NOTE(review): the values below are the strings 'input' and 'texfile', not
# the numeric codes 0/1/2 described above, so this description may be out of
# date -- confirm against the code that reads %TeXfileinclude.
my %TeXfileinclude=('\input'=>'input','\include'=>'texfile');
### Convert state keys to codes
# NOTE(review): convert_hash and the converter subs it is given are defined
# outside this section; presumably each hash value is replaced by the result
# of the converter (keywords become numeric state/counter codes) -- confirm.
convert_hash(\%TeXpreamble,\&keyarray_to_state);
convert_hash(\%TeXfloatinc,\&keyarray_to_state);
convert_hash(\%TeXmacro,\&keyarray_to_state);
convert_hash(\%TeXmacrocount,\&keyarray_to_cnt);
convert_hash(\%TeXenvir,\&key_to_state);
###### Define package specific rules
### Package rule definitions
my %PackageTeXpreamble=(); # TeXpreamble definitions per package
my %PackageTeXpackageinc=(); # TeXpackageinc definitions per package
my %PackageTeXfloatinc=(); # TeXfloatinc definitions per package
my %PackageTeXmacro=(); # TeXmacro definitions per package
my %PackageTeXmacrocount=(); # TeXmacrocount definitions per package
my %PackageTeXenvir=(); # TeXenvir definitions per package
my %PackageTeXfileinclude=(); # TeXfileinclude definitions per package
my %PackageSubpackage=(); # Subpackages to include (listed in array [...])
# Rules for bibliography inclusion
# These rules are stored under the pseudo-package name '%incbib'.
$PackageTeXmacrocount{'%incbib'}={
  'beginthebibliography'=>[qw(header hword)],
};
$PackageTeXmacro{'%incbib'}={
  '\bibliography'=>1,
};
$PackageTeXenvir{'%incbib'}={
  'thebibliography'=>'text',
};
$PackageTeXfileinclude{'%incbib'}={
  '\bibliography'=>'<bbl>',
};

# Rules for package alltt
$PackageTeXenvir{'alltt'}={
  'alltt'=>'xall',
};

# Rules for package babel
# NB: Only core macros are implemented, i.e. those expected to be found
# in regular documents
$PackageTeXenvir{'babel'}={
  'otherlanguage'=>'text',
  'otherlanguage*'=>'text',
};
$PackageTeXmacro{'babel'}={
  '\selectlanguage'=>1,
  '\foreignlanguage'=>[qw(ignore text)],
  'beginotherlanguage'=>1,
  'beginotherlanguage*'=>1,
};

# Rules for package comment
$PackageTeXenvir{'comment'}={
  'comment'=>'xxx',
};
# Rules for package color
# Numeric values give the number of parameters of the macro; array values
# list a handling rule for each parameter in turn.
$PackageTeXmacro{'color'}={
  '\textcolor'=>['ignore','text'],
  '\color'=>1,
  '\pagecolor'=>1,
  '\normalcolor'=>0,
  '\colorbox'=>['ignore','text'],
  '\fcolorbox'=>['ignore','ignore','text'],
  '\definecolor'=>3,
  # The backslash belongs inside the quotes: written outside, it creates a
  # reference to the string, and the hash key becomes "SCALAR(0x...)".
  '\DefineNamedColor'=>4,
};
# Rules for package endnotes
$PackageTeXmacro{'endnotes'}={
  '\endnote'=>['oword'],
  '\endnotetext'=>['oword'],
  '\addtoendnotetext'=>['oword'],
};

# Rules for package fancyhdr
# All of these macros are registered with the value 1.
$PackageTeXmacro{'fancyhdr'}={
  map { ($_=>1) } qw(\fancyhf \lhead \chead \rhead \lfoot \cfoot \rfoot)
};

# Rules for package geometry
$PackageTeXmacro{'geometry'}={
  '\geometry'=>1,
  '\newgeometry'=>1,
  '\restoregeometry'=>0,
  '\savegeometry'=>1,
  '\loadgeometry'=>1,
};
# Rules for package graphicx
$PackageTeXmacro{'graphicx'}={
  '\DeclareGraphicsExtensions'=>1,
  '\graphicspath'=>1,
  '\includegraphics'=>['[','ignore','ignore'],
  '\includegraphics*'=>['[','ignore','[','ignore','[','ignore','ignore'],
  '\rotatebox'=>1,
  '\scalebox'=>1,
  '\reflectbox'=>1,
  '\resizebox'=>1,
};

# Rules for package hyperref (urls themselves counted as one word)
# NB: \hyperref[label]{text} preferred over \hyperref{url}{category}{name}{text}
# NB: Macros for use in Form environments not implemented
$PackageTeXmacro{'hyperref'}={
  # Links and targets
  '\hyperref'=>['[','ignore','text'],
  '\url'=>1,
  '\nolinkurl'=>1,
  '\href'=>['ignore','text'],
  '\hyperlink'=>['ignore','text'],
  '\hypertarget'=>['ignore','text'],
  '\hyperbaseurl'=>1,
  '\hyperimage'=>['ignore','text'],
  '\hyperdef'=>['ignore','ignore','text'],
  '\phantomsection'=>0,
  '\autoref'=>1,
  '\autopageref'=>1,
  # Setup
  '\hypersetup'=>1,
  '\urlstyle'=>1,
  # PDF bookmarks and strings
  '\pdfbookmark'=>2,
  '\currentpdfbookmark'=>2,
  '\subpdfbookmark'=>2,
  '\belowpdfbookmark'=>2,
  '\pdfstringref'=>2,
  '\texorpdfstring'=>['text','ignore'],
  '\hypercalcbp'=>1,
  '\Acrobatmenu'=>2,
};
# Word counts for the url macros: each is given the count value 1
$PackageTeXmacrocount{'hyperref'}={
  '\url'=>1,
  '\nolinkurl'=>1,
};
# Rules for package import
$PackageTeXfileinclude{'import'}={
  '\import'=>'dir file',
  '\subimport'=>'subdir file',
  '\inputfrom'=>'dir file',
  '\subinputfrom'=>'subdir file',
  '\includefrom'=>'dir file',
  '\subincludefrom'=>'subdir file',
};

# Rules for package inputenc
$PackageTeXmacro{'inputenc'}={
  '\inputencoding'=>1,
};

# Rules for package listings
$PackageTeXenvir{'listings'}={
  'lstlisting'=>'xall',
};
$PackageTeXmacro{'listings'}={
  '\lstset'=>['ignore'],
  '\lstinputlisting'=>['ignore'],
};

# Rules for package psfig
$PackageTeXmacro{'psfig'}={
  '\psfig'=>1,
};

# Rules for package sectsty
# The font selection macros are all registered with the value 1,
# while \nohang is registered with the value 0.
$PackageTeXmacro{'sectsty'}={
  (map { ($_=>1) } qw(
    \allsectionsfont \partfont \chapterfont \sectionfont
    \subsectionfont \subsubsectionfont \paragraphfont \subparagraphfont
    \minisecfont \partnumberfont \parttitlefont \chapternumberfont
    \chaptertitlefont
  )),
  '\nohang'=>0,
};

# Rules for package setspace
$PackageTeXenvir{'setspace'}={
  'singlespace'=>'text',
  'singlespace*'=>'text',
  'onehalfspace'=>'text',
  'doublespace'=>'text',
  'spacing'=>'text',
};
$PackageTeXmacro{'setspace'}={
  'beginspacing'=>1,
  '\singlespacing'=>0,
  '\onehalfspacing'=>0,
  '\doublespacing'=>0,
  '\setstretch'=>1,
  '\SetSinglespace'=>1,
};

# Rules for package subfiles
$PackageTeXfileinclude{'subfiles'}={
  '\subfile'=>'file',
};
# Rules for package url
# NB: \url|...| variant not implemented, only \url{...}
# NB: \urldef{macro}{url} will not be counted
$PackageTeXmacro{'url'}={
  '\url'=>1,
  '\urldef'=>2,
  '\urlstyle'=>1,
  '\DeclareUrlCommand'=>['ignore','xxx'],
};
# Count rule for \url, registered for package url in the same way as the
# hyperref rules do. This must not be stored in $PackageTeXmacro{'setspace'}:
# doing so replaces the macro rules of the setspace package.
$PackageTeXmacrocount{'url'}={
  '\url'=>1,
};
# Rules for package wrapfig
# Both environments are registered as 'float', and the start of each
# environment is registered with the value 2.
$PackageTeXenvir{'wrapfig'}={
  'wrapfigure'=>'float',
  'wraptable'=>'float',
};
$PackageTeXmacro{'wrapfig'}={
  'beginwrapfigure'=>2,
  'beginwraptable'=>2,
};
# Rules for package xcolor (reimplements the color package)
# NB: only main macros (mostly from package color) included
$PackageTeXmacro{'xcolor'}={
  '\textcolor'=>['ignore','text'],
  '\color'=>1,
  '\pagecolor'=>1,
  '\normalcolor'=>0,
  '\colorbox'=>['ignore','text'],
  '\fcolorbox'=>['ignore','ignore','text'],
  '\definecolor'=>3,
  # The backslash belongs inside the quotes: written outside, it creates a
  # reference to the string, and the hash key becomes "SCALAR(0x...)".
  '\DefineNamedColor'=>4,
  '\colorlet'=>2,
};
###### Main script
###################################################
# Hand the command line arguments to MAIN, then terminate explicitly
# with exit status 0 so that the script is certain to end here.
MAIN(@ARGV);
exit(0);
###################################################
#########
######### Main routines
#########
# MAIN ROUTINE: Handle arguments, then parse files
# The options in @StartupOptions are processed before the arguments
# passed to MAIN. If files were given (or input is read from STDIN), the
# files are parsed and the results reported; otherwise the style list is
# printed when $showcodes>1, or an error is raised.
sub MAIN {
  my @args=(@StartupOptions,@_);
  Initialise();
  Check_Arguments(@args);
  my @toplevelfiles=Parse_Arguments(@args);
  Apply_Options();
  my $have_input=(@toplevelfiles || $fileFromSTDIN);
  if ($have_input) {
    # The version banner is left out for HTML output and for brief totals
    my $suppress_banner=($htmlstyle || ($briefsum && $totalflag));
    if ($showVersion && !$suppress_banner) {
      print "\n=== LaTeX word count (TeXcount version $versionnumber) ===\n\n";
    }
    conditional_print_style_list();
    my $totalcount=Parse_file_list(@toplevelfiles);
    conditional_print_total($totalcount);
    Report_Errors();
    print_word_freq() if ($optionWordFreq || $optionWordClassFreq);
    print_macro_stat() if $optionMacroStat;
  }
  elsif ($showcodes>1) {
    conditional_print_style_list();
  }
  else {
    error($Main,'No files specified.');
  }
  Close_Output();
}
# Initialise, overrule initial settings, etc.
sub Initialise {
  _option_subcount();
  # Windows settings: nothing is changed on MSWin32, whereas ANSI colours
  # are switched off on any other platform whose name starts with MSWin.
  #DELETE: do not overrule colour setting
  my $other_mswin=($^O ne 'MSWin32' && $^O=~/^MSWin/);
  if ($other_mswin) {
    option_ansi_colours(0);
  }
}
# Check arguments, exit on exit condition
# Only the first argument is inspected. With no arguments at all, the
# version and the short help are printed. If the first argument is one of
# the help, version or licence switches, the requested information is
# printed. In all of these cases the script exits; otherwise 1 is returned.
sub Check_Arguments {
  my @args=@_;
  if (!@args) {
    print_version();
    print_short_help();
    exit;
  }
  my $first=$args[0];
  # Plain help: -h, -?, -help (single or double dash), /? or /h
  if ($first=~/^(--?(h|\?|help)|\/(\?|h))$/) {
    print_help();
    exit;
  }
  # Help on a named rule: <help>=<rule>
  if ($first=~/^(--?(h|\?|help)|\/(\?|h))=(.*)$/) {
    print_help_on_rule($4);
    exit;
  }
  # Help on options, optionally restricted to a subset: <help>-opt[=<subset>]
  if ($first=~/^(--?(h|\?|help)|\/(\?|h))-?(opt|options?)$/) {
    print_syntax();
    exit;
  }
  if ($first=~/^(--?(h|\?|help)|\/(\?|h))-?(opt|options?)=(.*)$/) {
    print_syntax_subset($5);
    exit;
  }
  # Help on styles, optionally for one named style: <help>-style[=<name>]
  if ($first=~/^(--?(h|\?|help)|\/(\?|h))-?(styles?)$/) {
    print_help_on_styles();
    exit;
  }
  if ($first=~/^(--?(h|\?|help)|\/(\?|h))-?(styles?)=(\w+)$/) {
    print_help_on_styles($5);
    exit;
  }
  # Version and licence information
  if ($first=~/^--?(ver|version)$/) {
    print_version();
    exit;
  }
  if ($first=~/^--?(lic|license|licence)$/) {
    print_license();
    exit;
  }
  return 1;
}
# Parse arguments, set options (global) and return file list
# Each argument is first offered to parse_option. An argument that is not
# recognised as an option but starts with a dash is reported as invalid:
# the short help is printed and the script exits. Any other argument is
# taken to be a file name, in which backslashes are replaced by forward
# slashes. Returns the list of file names in the order given.
sub Parse_Arguments {
  my @args=@_;
  my @files;
  foreach my $arg (@args) {
    next if parse_option($arg);
    if ($arg=~/^\-/) {
      # Message text corrected: it used to read "Invalid opton"
      print "Invalid option $arg \n\n";
      print_short_help();
      exit;
    }
    # $arg aliases an element of the local copy @args, so the caller's
    # arguments are not modified by the substitution
    $arg=~s/\\/\//g;
    push @files,$arg;
  }
  return @files;
}
# Parse individual option parameters
# The argument is offered to each group of option parsers in turn until
# one of them returns a true value, which is then returned. If none does,
# the (false) value returned by the last parser is returned.
sub parse_option {
  my ($arg)=@_;
  my @parsers=(
    \&parse_options_preset,
    \&parse_options_parsing,
    \&parse_options_counts,
    \&parse_options_output,
    \&parse_options_format,
  );
  my $result;
  foreach my $parser (@parsers) {
    $result=$parser->($arg);
    last if $result;
  }
  return $result;
}
# Parse presetting options
# Recognises -opt=, -option=, -options= and -optionfile=, passing the text
# after the equals sign to _parse_optionfile. Returns 1 if the argument
# was recognised, 0 otherwise.
sub parse_options_preset {
  my ($arg)=@_;
  return 0 unless $arg=~/^-(opt|option|options|optionfile)=(.*)$/;
  _parse_optionfile($2);
  return 1;
}
# Parse parsing options
# Handles a single command line argument. If it is one of the options
# below, the corresponding global setting is updated (or the corresponding
# routine called) and 1 is returned; otherwise 0 is returned. The tests
# are made in a fixed order, which matters since the language option
# pattern also matches other lower case options.
sub parse_options_parsing {
  my ($arg)=@_;
  # Input from STDIN and handling of included files
  if ($arg eq '-') {$fileFromSTDIN=1; return 1;}
  if ($arg eq '-merge') {$includeTeX=2; return 1;}
  if ($arg eq '-inc') {$includeTeX=1; return 1;}
  if ($arg eq '-noinc') {$includeTeX=0; return 1;}
  if ($arg =~/^-(includepackage|incpackage|package|pack)=(.*)$/) {
    include_package($2);
    return 1;
  }
  # Bibliography inclusion
  if ($arg eq '-incbib') {$includeBibliography=1; return 1;}
  if ($arg eq '-nobib') {$includeBibliography=0; return 1;}
  # Working directory: -dir alone unsets it, while -dir=path sets it,
  # appending a slash unless the path already ends with / or \
  if ($arg eq '-dir') {$globalworkdir=undef; return 1;}
  if ($arg=~/^-dir=(.*)$/) {
    $globalworkdir=$1;
    $globalworkdir=~s:([^\/\\])$:$1\/:;
    return 1;
  }
  # Auxiliary directory: handled in the same way as the working directory
  if ($arg eq '-auxdir') {$auxdir=undef; return 1;}
  if ($arg=~/^-auxdir=(.*)$/) {
    $auxdir=$1;
    $auxdir=~s:([^\/\\])$:$1\/:;
    return 1;
  }
  # Encoding, scripts and language
  if ($arg =~/^-(enc|encode|encoding)=(.+)$/) {$encoding=$2; return 1;}
  if ($arg =~/^-(utf8|unicode)$/) {$encoding='utf8'; return 1;}
  if ($arg =~/^-(alpha(bets?)?)=(.*)$/) {
    set_script_options(\@AlphabetScripts,$3);
    return 1;
  }
  if ($arg =~/^-(logo(grams?)?)=(.*)$/) {
    set_script_options(\@LogogramScripts,$3);
    return 1;
  }
  # Accepted as a language option only if set_language_option says so
  if ($arg =~/^-([-a-z]+)$/ && set_language_option($1)) {return 1;}
  # Named pattern sets: the option name without the dash is the key
  if ($arg eq '-relaxed' || $arg eq '-restricted') {
    my $pattern_name=substr($arg,1);
    $MacroOptionPattern=$NamedMacroOptionPattern{$pattern_name};
    $LetterPattern=$NamedLetterPattern{$pattern_name};
    return 1;
  }
  return 0;
}
# Parse count and summation options
sub parse_options_counts {
my $arg=shift @_;
if ($arg =~/^-sum(=(.+))?$/) {_option_sum($2);}