#!/usr/bin/perl

=pod

tatatag.pl
tatatag - a semantic typing machine

Version: August 20, 2025

This is the semantic tagging module of Text·a·gram

For documentation and the latest version go to

https://tecling.com/textagram

(There are other modules for text analysis on this website).

This program takes a folder with plain text as input and
the output is a new folder with html documents containing
the annotations of the input files. It also converts the
html files to odt to facilitate the revision by human annotators.

pandoc -s input.html -o output.odt


Pending:

* Wikipedia could be integrated as a semantic tagger to improve recall
* If an element is not repeated in sequences that have a name and it is
	included in the taxonomy as a noun, then it is not a person
* The output should be a table in a format similar to UD
* We need an English lexical database for the plural of nouns
* A preprocessing is needed to eliminate modalization and discourse markers
	This is very easy to do using Text·a·gram
* We need to integrate linguini, the language detector
	(at the moment, the selection of the language is manual)
	
* "Art director to the company was Thomas Battam"
	Here it says that Battam is UNKNOWN, but if Thomas is a given name,
	then the full name should be tagged as PERSON




Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

Copyright 2024 Rogelio Nazar

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.





=cut

$|++;
use strict;

my $tit = "Tatatag";
my $subtit = "a semantic typing machine";
my $version = "August 20, 2025: This version works with English and Spanish";
my $url = "https://tecling.com/textagram";
	# This is the website of the textagram project
	# It includes other services other than semantic typing

my ($corpus,  # the folder with the input
$lang, # en / es
$level, # the level of analysis: tab, cat & tag (by default, all)
	# tab: the tabulation of name with category
	# cat: the tabulation of category with name
	# tag: the input text with the tags
	# pending: the UD tag to produce the POS-tag table with the sem-types
	# a csv-output version is on its way
$out
) = @ARGV;

 

$corpus =~ s/\/+$//g;

if (!$corpus || ! -d $corpus || $lang !~ /^(en|es)$/) {
	die "\n 
	$tit - $subtit
	$url
	Version: $version
	
	This is the local verion of Tatatag, the semantic tagger.

	To execute, the script needs two arguments:
	
	1) the name of the folder containing the texts to be processed. Of course, they need to be UNIX plain text files in UTF8 encoding
	
	2) the language (en|es)	
	
	";
}

if (!$out) {
	$out = "output-$lang-".time();
	mkdir $out or die "\nI cannot create the folder for the output ($out).";
}

#print "\nOutput folder: $out";

if (! -d  $out) {
	print "\nOutput folder $out unavailable: $? $!" and exit;
	# I have to use print/exit instead of die for the web environment
}

# some useful subroutines
sub aseptic {
	my $in = $_[0];
	$in =~ s/[`\(\)\[\]\+\*]//g; # \?\.
	return $in;
}

sub readfile {
	my $file = $_[0];
	if (! -e $file ) {
		$file = "./tatatag/".$file;
	}
	open( my $fh, '<', $file ) or print "\n<br>I can't read [$file]" and exit;
	my @cont = <$fh>;
	close $fh;
	return @cont;
}


# These variables are configured according to language (en/es)
my ($langmodel, 
$taxo, 
$vectors, 
$stoplist, 
$lemario,
$minimod, # this is a small language model (of word frequencies)
$semtypes,
$periods,
$unamb,
$triggers,
);


my $umbini = 1000; # this is a threshold to determine when a word is too common 

my $nonos = 'piece|part|branch|other|something|parte|partir'; 
# these are intractable nouns
# pending: do a better (wider) selection


# language configuration
# pending: syntactic rules to find the head of a noun phrase
# should be here

$stoplist = "shorstoplist_".$lang.".txt";
$minimod = "freclist_".$lang.".txt";
$triggers = "triggers_".$lang.".txt";
$semtypes = "semantypes_".$lang.".txt";
$periods = "periods_".$lang.".txt";
$unamb = "unamb_".$lang.".txt";

if ($lang eq "es") {
	$langmodel = "spanish-ancora-ud-2.5-191206.udpipe";
	$taxo = "taxo1nov2024.csv";
	$vectors = "magnvec2.txt";
	$lemario = 'pulurales.txt';
	$umbini = $umbini * 4000;
} else {
	$langmodel = "english-gum-ud-2.5-191206.udpipe";
	$taxo = "EnglishTaxonomyMay11-2025.txt";
	$vectors = "minimod.csv";
	$lemario = ''; # pending!
}

my $udpipe = "/opt/udpipe/src/udpipe";

my @data = (	"masterPlace13feb2025.txt", 
		"masterPil13feb2025.txt", 
		"masterSurname13feb2025.txt");

my %placard; # these are place triggers (e.g., island, city, airport) obtained from masterPlace
my $params = 500; # this is the dimensionality of the vectors
my %heads; # this hash stores the heads of the noun phrase when their categorization is secure
my %headtab; # this table connects the nounphrases with their heads
my %triggers; # these are elements read from files that are used to guess a type from the context
my @trigal = &readfile($triggers);
my $categ;
foreach my $line (@trigal) {
	chomp $line;
	next if (!$line);
	if ($categ && $line =~ s/^\t//g) {
		$triggers{$categ}{$line}++;
	} else {
		$categ = $line;
	}
}


############################
# If we are still here, it means there is some input, so the process begins.
##########################

# The output is in html, so we need a header
my $head = "<html>
<head>
<title>$tit: $subtit</title>
<meta http-equiv='Content-Type' content='text/html; charset=utf-8'>
<meta http-equiv='Pragma' content='no-cache'>
<meta http-equiv='Expires' content='-1'>	
<meta name='viewport' content='width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=0'/>
<link rel='stylesheet' href='/style/tecling.css' type='text/css' charset='utf-8'>
<style type='text/css'>
	body {
		font-family: 'telluralregular';
	}
	</style>
</head>
<body>
<a href='$url'>
<h1>$tit: $subtit</h1>
$url
</a>
<br><br>Version: $version";

my $html = $head;
my @textoform; # the tokens of the text
my @textolem; # the lemmas of the text
my %lema; # lemmas from the tagger
my %lemata; # lemmas obtained from lexical database (if available)
my %pulu; # plural forms obtained from lexical database (if available)
my $br = "\n<br>"; # line break 
my %master; # this is the gazetteer

my %tags = ( # these are the names of the files containing the proper nouns
	$data[0] => "LUGAR",
	$data[1] => "PILA",
	$data[2] => "APELLIDO",
);

print "\n<br>Loading data";

#print "\n<br>Reading stoplist...";
print ".";

my %stop;
my @stop = &readfile($stoplist);
foreach my $line (@stop) {
	chomp $line;
	$stop{$line}++;
}

#print "\n<br>Loading minimod...";
print ".";
my %minimod;
my @m = &readfile($minimod);
foreach my $line (@m) {
	chomp $line;
	my @l = split /\t/, $line;
	$minimod{$l[0]} += $l[1];
}

#print "\n<br>Loading gazzetteer...";
print ".";
foreach my $f (@data) {
	my @file = &readfile($f);
	foreach my $line (@file) {
		chomp $line;
		$line = lc $line;
		if ($f eq $data[0] && $line =~ / /) {
			# we are in the Place file
			my @split = split / /, $line;
			foreach my $part (@split) {
				$placard{$part}++;
			}
		}
		my ($nom, $gen) = split /\t/, $line;
		if ($gen eq "m") { # the gender is not needed now, 
				# but it may come handy in the future
			$gen = "hombre";
		} elsif ($gen eq "f") {
			$gen = "mujer";
		} elsif ($f eq $data[1]) {
			$gen = "indefinido";
		}
		if (length($nom) > 2) {
			if ($gen) {
				$master{$f}{$nom} = $gen;
			} else {
				$master{$f}{$nom}++;
			}
		}
	}
}

# here we load the lexical database. 
# If there is no database, the system will use the lemmatization of the POS-tagger
if (-e $lemario) {
 # print "\n<br>Reading lexical database...";
 print ".";
  my @pulu = &readfile($lemario);
  foreach my $line (@pulu) {
	chomp $line;
	my @cols = split /\t/, $line if ($line =~ /\t/);
	my $pulu = shift @cols;
	my $lema;
	foreach my $case (@cols) {
		if ($case =~ s/^N[^ ]+ //g) {
			$case =~ s/.+://g;
			if ($case ne $pulu) {
				$lema{$pulu} = $case;
				$pulu{$case} = $pulu;
			}
		}
	}
  }
}

#print "\n<br>Reading semantic types...";
print ".";
my @semtypes = &readfile($semtypes);
my %semtypes;
foreach my $type (@semtypes) {
	chomp $type;
	$semtypes{$type}++;
}

#print "\nSemantic types in [$lang]: ".(scalar keys %semtypes);

my %tr = ( # this is a small translation table
    PERSONA => { en => "PERSON", es => "PERSONA" },
    LUGAR => { en => "PLACE", es => "LUGAR" },
    PERIODO => { en => "PERIOD", es => "PERIODO" },
    EVENTO  => { en => "EVENT", es => "EVENTO" },
    INSTITUTION  => { en => "INSTITUTION", es => "INSTITUCIÓN" },
);


# these ones are so frequent and unambiguous
# that they justify their own rules
#print "\n<br>Reading more data...";
print ".";
my @unamb = &readfile($unamb);
my %unamb;
my $categ;
foreach my $line (@unamb) {
	chomp $line;
	next if (!$line);
	if ($categ && $line =~ s/^\t//g) {
		$unamb{$line} = $categ;
	} else {
		$categ = $line;
	}
}

# these are common expression for time periods
my @periodkey = &readfile($periods);

foreach my $m (@periodkey) {
	chomp $m;
	$master{period}{lc($m)}++; 
}

my @cols = ("Periodo", "Lugar", "Pila", "Surname", "SemType", "Tatatag");

my $toksep = ' ,;\)\(\.:\?'; # do not delete the space at the beginning
 
my (%found, # total number of entries in the taxonomy
 %hypos,
 %cat,
 %defs, # the definitions from the Wiktionary
 %senses, # definitions turned into vectors
 %chain, # the hypernymy chains
 $grifo,
 $sense,
 $entry,
 $idnum,
 $hyper,

 # the following are global but reset with each text
 
 $text, # the content of the text
 %report, # a report of what happens with each entry
 %objects, # the objects found in the text
 %peoparts, # if it's a multiword expression and a proper names, its parts are stored here
 
 ); 

#print "\n<br>Ok. Loading taxonomy...";
print ".";
my @taxo = &readfile($taxo);

foreach my $line (@taxo) {
	chomp $line;
	last if ($line eq "FRACAS:");
	next if ($line =~ /Sin resultado:/);
	$line =~ s/\*//g;
	if ($line =~ /\-\-\-+([^\-]+)\-\-\-+/) {
		$entry = $1;
		$grifo++;
		$found{$entry}++;
		$idnum++;
		$sense = -1;
		next;
	} elsif ($grifo && $line =~ s/^(\t+)([^\t]+)//) {	
		my $chich = $2;
		$hyper = $chich if (!$hyper);

		# Here I try to reject false hypernyms contained in the taxonomy
		# pending: clean the taxonomy
		undef $hyper if ($hyper =~ /^($nonos)$/);
		next if (!$hyper);

		# longer definitions attract to many contexts
		# so I have to normalyze their size
		# In all cases I keep only the first 12 words
		my @t = split /[$toksep]+/, $line;
		my $head;
		foreach my $i (0 .. 12) { 
			$head .= " ".$t[$i] if ($t[$i]);
		}
		push (@{$chain{$entry}{$hyper}}, $chich);
		$defs{$entry}{$hyper} .= "\n " . $hyper . " ". $head if ($head);
		
	} elsif ($grifo) {
		$sense++;
		undef $hyper;
		next;
	}
}

my $load = scalar keys %found;
#print "\n<br>Number of entries in the taxonomy: $load";

# Now I have to vectoryze the definitions
# pending: I need to factoryze the code with function. There is a lot of repeated code.
foreach my $entry (sort keys %defs) { 
   foreach my $sense (sort keys %{$defs{$entry}}) { 
	chomp $defs{$entry}{$sense};
	$senses{$entry}{$sense} = lc $defs{$entry}{$sense};
	my @def = split /[$toksep]+/, $senses{$entry}{$sense};
	undef $senses{$entry}{$sense};
	foreach my $tok (@def) {
		next if ($stop{$tok});
		$senses{$entry}{$sense}{$tok}++;
		if ($pulu{$tok}) {
			$senses{$entry}{$sense}{$pulu{$tok}}++;
		}
		if ($lema{$tok}) {
			$senses{$entry}{$sense}{$lema{$tok}}++;
		}
	}
   }
}

# Now I load the small language model
my %ponder; # this is the weight of the co-occurrences
my %disp; # the dispersion of the co-occurrence, used against them
# print "\n<br>Loading database...";
print ".";
my $stop;
my $first;
my $word;
my $frec; 

# pending: the Spanish version of the model does not include frequency
# It should.
# For the moment, I'll let it ignore the frequency in this language

if ( ! -e $vectors ) {
	$vectors = "./tatatag/".$vectors;
}

open( my $fh, '<', $vectors ) 
	or print "\nI cannot open file $vectors!!!" and exit;
while ( my $line = <$fh> ) {
	chomp $line;
	if ( $line =~ /^[^\t ]+/ ) {
		($word, $frec) = split /\t/, $line;
		undef $stop;
		undef $first;
	} elsif ( $line =~ /^\t([^\t ]+)\t([^\t ]+)/ ) {
		my $cooc = $1;
		my $mi = $2;
		next if ($stop{$cooc});
		if ($lang eq "es") {
			$first = $mi if (!$first);
				# here is where I ignore frequency in Spanish
				# instead, it takes the frequency of the most frequent co-occurrence 
				# as reference.
				# At some point in the future it will be corrected.
			$disp{$cooc}++;
			$stop++;
			if ($stop < $params) { 
				$ponder{$word}{$cooc} = sprintf("%.2f", 10*($mi/$first));
			}	
		} else {
			$ponder{$word}{$cooc} = $mi;
		}
	}
}
close $fh;


# We loaded all the necessary data
# Now we proceed to tag one document at a time.

print "\nOk. Processing the texts...";

opendir my $handle, $corpus or die "Could not open '$corpus' for reading: $!\n";
my @files = readdir $handle;
my $i;
my @goodies; # the docs to analyze
foreach my $txt (sort @files ) {
	next if ($txt =~ /^\.+/ || $txt =~ /\.tag$/);
	if (! -T $corpus."/".$txt) {
		print "\n<br>File $txt does not look like plain text.";
		next;
	}
	push (@goodies, $txt);
}

my $tot = scalar (@goodies);
foreach my $txt (sort @goodies ) {
	$i++;
	my $per = int (100* ($i/$tot));
	my $prog = "\nText $txt ($i/$tot: $per%)\n";
	print $prog;	
	
	my $taggedtext = $corpus."/".$txt.".tag";
	
	if (! -e $taggedtext) {
		#print "\nThis text is not yet tagged. Calling the tagger. Sit tight...";
		&postag($corpus."/".$txt);
	}
	
	if (! -e $taggedtext) {
		print "\nError: file $taggedtext not found ($? $!)";
		exit;
	}
	
	&barrido($corpus,$txt);
}

sub postag {
	# For a question of efficiency, we separate the POS-tagging
	# from the semantic tagging
	my $textotag = $_[0];
	
	##################################################
	#print "\nOk. \n<br>Tagging the text $textotag...";
	#######################################################
		
	# we call UDPIPE

	if (! -e $udpipe) {
		die "\nError: the tagger can't be found! Check the path: $udpipe";
	}
	my $confudp = " --tokenize --tag --parse /opt/udpipe/languages_models/$langmodel";	
	my $orden = "cat '$textotag' | $udpipe $confudp > $textotag.tag";
	my $tagged = `$orden`;
	print "\n Ok.";
}

sub barrido {
	my ($corpus, $txt) = @_;
	my $taggedtext = $corpus."/".$txt.".tag";
	my $baked = "\n<hr> Result for text <i>$taggedtext</i>";
	my $tabulars; # I put the summary tables after the tagged text
	my @file = &readfile($taggedtext);
	my %sustant; # nouns
	my @pos; # POS tags 
	my $enclit; # a defect of UDPipe with enclitic pronouns
	my @buf;
	my @sintag; # This array stores noun phrases
		# It's a local memory. When a name is detected, it goes from
		# here to hash %objects
		# and this array is destroyed
	my %vals; # these are the values of every individual component
	my %mem; # this is a short-term memory of everything that is detected
	my %glomem; # this instead is a long-term memory
	undef %report;
	
	# The first pass is just for the chunking
	# Then we proceed with the classification
	# all chunks are stored in this hash, as previously explained
	undef %objects;
	my @text; # this is the text as it comes. It's stored here in order to be reproduced later
	
	foreach my $line (@file) {
		chomp $line;
		if ($line eq "# newpar") {
			push (@textoform, $br);
			push (@textolem, $br);
			push (@pos, $line);
			next;	
		} elsif ($line =~ s/# text = //) {
			push (@text,  "\n"); 
			next;
		}
		
		my ($num, $tok, $lem, $pos, $copypos, $morf) = split /\t/, $line;
		push (@text, $tok);
		push (@buf, $tok);
		while (scalar(@buf)>5) {
			shift @buf;
		}
		
		my $lor = $tok;
		next if (!$lor || $lor =~ /[0-9]/);
		$tok = lc ($tok); 
		$lem = lc ($lem);
		if ($lem eq "viru") { # yes, this is a common error in the lemmatization of the POS-tagger
			$lem = "virus"; 
		}
		$tok = &aseptic($tok);
		$lem = &aseptic($lem);
		push (@textoform, $tok);
		push (@textolem, $lem);
		my $lastpos = $pos[$#pos]; 
		push (@pos, $pos); 
		if ($lema{$tok} && $lema{$tok} ne $lem) {
			if ($senses{$lema{$tok}} && !$senses{$lem}) {
				$lem = $lema{$tok};
			}
		}
		$lemata{$tok} = $lem;
		$sustant{$lem}++;
		$vals{$lor}{'pos'} = $pos;
		if ($pos =~ /PROPN|NOUN/ ) {
			push (@sintag,  $lor); 	
		} else {
			my $sint = join " ", @sintag;
			$sint = lc $sint;		
			$sint =~ s/^[\-]+//g;
			$sint =~ s/[\(\)\*\+\?\[\]_\-]+/ /g;
			$sint =~ s/  / /g;
			if ($sint !~ / / && $lema{$sint}) {
				$sint = $lema{$sint};
				# pending: this is problematic because sometimes
				# a proper noun, like "Rosas", is destroyed with this rule
				# It should check if it's not a proper noun before applying the rule
			}
			
			if ($sint 
				&& length($sint) < 70 
				&& length($sint) > 2 
				&& $sint !~ /\.$/ 
				&& $sint !~ /^($nonos)$/
				&& $sint ne "ver" 
				&& $sint !~ / ver$/ 
				&& $sint !~ /wikidata/			
			){
				$objects{$sint}++;
			} 
			undef @sintag;	
		}
	}
		
	# I present first the text without annotations
	# because it makes revision easier
	$text = join " ", @text;	
	$text =~ s/\n/\n<br>/g;
	if (!$level) { 	
		$baked .=  "\n<h2>Text without annotations</h2>	
			<table style='width: 100%;' border=1>
		<tr>
		<td valign=top>$text</td>
		</tr>
		</table><br><br>";	
	}
	$text = lc (join " ", @text); # this one is global because it is used by other subroutines 
	$text =~ s/_+/ /g;
	my %length; # the length of the objects, needed for the annotation (longer ones go first)
	my %tatatags;
	my %cats;
	my %sort;
	my %story;
	undef %peoparts;

	# the first round is to calculate. The second one is to print.
	
	foreach my $o (sort {$objects{$b} <=> $objects{$a} } keys %objects) {	
		my ($tag, $story, $altern) = &newtat($o, $text);		
		if ($tag eq $tr{PERSONA}{$lang} && $o =~ / /) {
			my @parts = split / +/, $o;
			foreach my $p (@parts) {
				next if (length($p) < 3);
				$peoparts{$p}++;
			}
		}
		
		$story{$o} = $story;
		# this one feeds %heads
		$length{$o} = length($o);
		next if (!$o || !$tag);
		$tatatags{$o} = $tag;
		$cats{$tag}{$o}++;
		$sort{$tag}++;
	}
	
	if (!$level || $level =~ /tab|htm/) {
		$tabulars =  "\n<h2>Objects found</h2>
		<table style='width: 100%;' border=1>
		<tr>
		<td valign=top style='width: 5%;'><b>Id</b></td>
		<td valign=top style='width: 5%;'><b>Form</b></td>
		<td valign=top style='width: 5%;'><b>Freq</b></td>
		<td valign=top style='width: 5%;'><b>Tag</b></td>
		<td valign=top style='width: 70%;'><b>Context</b></td>
		<td valign=top style='width: 5%;'><b>Error</b></td>
		</tr>";	
		
		#<td valign=top style='width: 5%;'><b>Notes</b></td>
	}
	my $id;
	my $misses; # this is used to calculate recall
	foreach my $o (sort {$objects{$b} <=> $objects{$a} } keys %objects) {
		next if (length($o) < 3);
		if ($heads{$o} && $tatatags{$o} ne $heads{$o}) {
				# we already found a tag for this element,
				# so the result is overruled
				$tatatags{$o} = $heads{$o};
				$story{$o} = "Overruled by head ($heads{$o})";
		}
		
		# pending:
		# The head has already been extracted
		# This is redundant
		if ($o =~ / /) {		
			# it's a multi-word expression, so I extract the head
			# This depends on the language
			my $head;
			if ($lang eq "en") {
				$o =~ / ([^ ]+)$/;
				$head = $1;
			} else {
				$o =~ /^([^ ]+) /;
				$head = $1;		
			}
				
			if ($heads{$head} && $tatatags{$o} ne $heads{$head}) {
				# result if overruled
				$tatatags{$o} = $heads{$head};
				$story{$o} = "Overruled by head ($heads{$o})";
			}
		
		}
		
		if (!$report{$o} && $headtab{$o}) {
			# it has no report because it is a multi-word expression
			# so I must get the head, because that is where the report is stored
			$report{$o} = $report{$headtab{$o}};
		}
		
		if ($report{$o}) {
			$report{$o} =~ s/\n/\n<br>/g;
			$report{$o} =~ s/["']/ /g;
			$report{$o} = "<font size=1>$report{$o}</font>";
		}
		
		if (!$tatatags{$o} || $tatatags{$o} eq "* NO RESULT *") {
			$tatatags{$o} = "<font color=red>UNKNOWN</font>";
			$misses++;
		}
		
		$id++;
		
 		# I paste a context to help the reviewers
		# For this I have to check de contexts of occurrence of the objects
		my $ctxt;
		my @lines = split /\n/, $text;
		foreach my $l (@lines) {
			$l = " $l ";
			if ($l =~ s/[ \-\.,;\(]($o)[ \-\.,;\)]/ <b>$1<\/b> /ig) {
				$ctxt = $l;
				last;
			}
		}
		if ($tabulars) {
		
			$tabulars .=  "\n<tr><td valign=top>$id</td><td valign=top>$o</td><td valign=top>$objects{$o}</td><td valign=top>$tatatags{$o}</td><td valign=top><font size=2>$ctxt</font></td><td valign=top></td></tr>";
			# <td valign=top>$story{$o} <br>$report{$o}</td>
		}
		 	
	}
	if ($tabulars) {
		$tabulars .=  "\n</table>";
	}
	
	if (!$id) {
		print "\n****************************
*		
*		
*	No objects detected for file $txt. Skipping...
*	
**********************************
		
		";
		next;
	}
	
	
	my $rec = sprintf "%.2f", 100* (($id-$misses)/$id);
	
	$baked .=  "\n Recall: $rec%";
	
	
	if (!$level || $level =~ /cat|htm/) {
		$tabulars .=  "
	
		<h2>Objects sorted by category</h2>
	
			<table border=1>
		\n<tr><td valign=top><b>Categoría</b></td>
		<td valign=top><b>Objetos</b></td></tr>";
	
		foreach my $cat (sort {$sort{$b} <=> $sort{$a} }  keys %cats) {
		$tabulars .=  "\n<tr><td valign=top>$cat</td><td valign=top>";
		foreach my $o (sort keys %{$cats{$cat}}) {
			$tabulars .=  "\n<br>$o";
		}	
		$tabulars .=  "\n</td></tr>";
		}
	
		$tabulars .=  "\n</table>";
	}
	
	
	if (!$level || $level =~ /tag|htm/) {
		$baked .= "<h2> Annotated text </h2>";
		
		foreach my $o (sort {$length{$b} <=> $length{$a} } keys %length) {
			next if (!$tatatags{$o});
			$text = " $text ";
			$text =~ s/ ($o) / <b>$1 \[$tatatags{$o}\] <\/b> /ig;		
		}
	
		$text =~ s/ (\[[^\]]+\]) <\/b> \[[^\]]+\]/ $1 <\/b> /g;	
	
		$baked .=  "\n<br><table border=1>
			<tr><td>$text\n</td></tr></table>";
	}

	if ($tabulars) {
		$baked .= $tabulars;
	}

	$txt =~ s/\.....?//g;
	undef $head if ($level);
	open(BARR, ">", $out."/".$txt.".htm") 
		or print "I cannot create result for $txt [$? $!]" and exit;
	print BARR $head. $baked;
	close BARR;
	
	my $formato = "odt";
	
	system ("/usr/bin/pandoc -s $out/$txt.htm -o $out/$txt.$formato");

	if ($? != 0) {
		die "\n Conversion from html to $formato failed to execute for file $txt: $? $!";
	}
	
}

sub newtat {
	# this is the clasification subroutine

	my ($o, $text) = @_; # the input text
	
	return ($unamb{$o}, "unambiguous") if ($unamb{$o});
	
	if ($master{period}{$o}) {
		# it's a time expression
		return ($tr{PERIODO}{$lang}, "Rule 1: [$tr{PERIODO}{$lang}]");
	}

	my $air = " $o "; # we need spaces
	my %rank;
	my %tag;

	my @parts = split / /, $o;
	# if it's a multi-word expression, I need to find the head
	# this changes according to the language
	# pending: to improve this process
	my $head = $parts[0];
	my $tail = $parts[$#parts];
	if ($lang eq "en") {
		$head = $parts[$#parts];
		$tail = $parts[0];			
	}

	return ($unamb{$head}) if ($unamb{$head});

	if ($semtypes{$head}) {
		return (uc $head, "semtype tail");
	}

	my $triggers = join '|', keys %{$triggers{$tr{EVENTO}{$lang}}};
	
	if ($head =~ /^($triggers)s?$/) {
		return ($tr{EVENTO}{$lang}, "event hard rule");
	}

	$triggers = join '|', keys %{$triggers{$tr{INSTITUTION}{$lang}}};

	if ($head =~ /^($triggers)$/ || $o =~ /($triggers) of/) {
		return ($tr{INSTITUTION}{$lang}, "institution hard rule");
	}

	$triggers = join '|', keys %{$triggers{$tr{LUGAR}{$lang}}};

	if ($head =~ /^($triggers)$/) {
		return ($tr{LUGAR}{$lang}, "triggers ($1)");
	}

	if ($o =~ / /) {
		$headtab{$o} = $head;
		# I connect multi-word expressions with their heads
	}

	if ($heads{$head}) {
		return ($heads{$head}, "Rule: headfirst");
	}

	if ($master{period}{$head}) {
		return ($tr{PERIODO}{$lang}, "Rule: PERIOD [$tr{PERIODO}{$lang}]");
	}

	# Now we check the contexts
	my @lines = split /\n/, $text;
	# we go line by line
	my $isplace;
	my $isorg;
	foreach my $line (@lines) {
	
		# pendiente: colegio de san carlos
	
		# avenida
		# calle
		# plaza
		
		# debería operar primero descartando las 
		# categorías principales
		
		# lugar, persona, organización, evento, 
		# y recién entonces buscar lo demás
		
		# podría aprovechar el análisis sintáctico
			
	
		if ($line =~ /(University of|work(ing)? at) $o/i) {
			return ($tr{"INSTITUTION"}{$lang}, "Insti rule ($1 $o)");		
		}
		
		if ($line =~ /((born|died) (at|in) *.{0,15} $o|(of|at) $o, [^ ,;:\.]{4,}[,;:\.]|(of|at) [^ ,;:\.]{4,}, $o\.|of $o *, in |(situated|located) on (the )?$o)/i) {

			# in $o *[,\.]|
			# pending: I should be taking upper case letters into account
			
			# in Redmond, Washington
			# in Steubenville, Ohio
			# of Holzminden, in
			# Lauenförde is situated on the Weser
			return ($tr{LUGAR}{$lang}, "context rule: [$tr{LUGAR}{$lang}] ($1)");
		} 
		
		$triggers = join '|', keys %{$triggers{$tr{LUGAR}{$lang}}};
		if ($line =~ /(($triggers) ((del?|de la) )?$o)/) {
			return ($tr{LUGAR}{$lang}, "context rule: [$tr{LUGAR}{$lang}] ($1)", $1);
		
		} 
		
		$triggers = join '|', keys %{$triggers{$tr{EVENTO}{$lang}}};
		if ($line =~ /(($triggers) (de|a|el|los|las?|una?) $o)/) {
			return ($tr{EVENTO}{$lang}, "context rule: [$tr{EVENTO}{$lang}] ($1)");
		}
		
	}

	$triggers = join '|', keys %{$triggers{$tr{PERSONA}{$lang}}};
	if ($air =~ / ($triggers) /) {
		$heads{$head} = $tr{PERSONA}{$lang};
		return ($tr{PERSONA}{$lang}, "Rule 0: [$tr{PERSONA}{$lang}] ($1)");
	}
	
	# These cases are common:
	# edward l. beach
	# bradford w. parkinson
	# edward c. stone
	if ( ( $master{$data[1]}{$parts[0]} || $master{$data[2]}{$parts[$#parts]} )
		&& $o =~ / [a-z]\. / ) {
		$heads{$head} = $tr{PERSONA}{$lang};
		return ($tr{PERSONA}{$lang}, "Rule MidName: [$tr{PERSONA}{$lang}]");
	}
		
	
	# if Charles Minnigerode has been detected earlier as a person, then it should
	# also be a person when it appears only as minnigerode
	if ($peoparts{$o}) {
		return ($tr{PERSONA}{$lang}, "peoparts rule");
	}

	# if the last element is in the PLACE list, like in "Staten Island", then it is a place
	if ((($head && $placard{$head}) || ($tail && $placard{$tail}))
	     	&& ($o !~ / / || !$master{$data[1]}{$parts[0]}) # controla si hay nombre de pila al principio
	) {
		return ($tr{LUGAR}{$lang}, "place card trigger");
		
	}


	# if it's a single word and there is a conflict between person and place
	# and this word does not appear in other names of persons that have appeared earlier
	# in the text, then it is a place
	if ( $o !~ / / && $master{$data[0]}{$o} && ($master{$data[1]}{$o} || $master{$data[2]}{$o} )  ) {
		my $abort;
		# this single word is not part of other multi-word expressions
		foreach my $check (keys %objects) {
			if ($check =~ / / && $check =~ /$o/ ) {
				$abort++;
				last;
			}
		}
		return ($tr{LUGAR}{$lang}, "mono place") unless $abort;
		
	}

	# if it's a multi-word expression and the first element is in the given names list
	# like "david cross" ("cross" may also be in the place list)
	# then it is a person 
	if ( $o =~ / / && $master{$data[1]}{$o} ) {
		return ($tr{PERSON}{$lang}, "poli pila");
	}
	

	# all other rules have failed	
	# so we analyze each component one by one
	
	foreach my $part (@parts) {
		if ($master{period}{$part}) {
			$tag{Periodo} .= " $part "; 
			$rank{Periodo}++;
		}
		
		if ($semtypes{$part}) {
			$tag{SemType} .= " $part ";
			$rank{SemType}++;
		}
		
		foreach my $i (0 .. 2) {
			if ($master{$data[$i]}{$part}) {
				$tag{$cols[$i+1]} .= " $part ";
				$rank{$cols[$i+1]}++;				
			}		
		}
	}

	if ($rank{Lugar} && $rank{Lugar} >= ($rank{Pila} + $rank{Surname})) {
		return ($tr{LUGAR}{$lang}, "Rule 2: [$tr{LUGAR}{$lang}] ($rank{Lugar} >= ($rank{Pila} + $rank{Surname})");
	} 
	
	if (($rank{Pila} + $rank{Surname}) > 1) { 
	
		# pending: at some point I decided to change the text to lower case letter
		# so now I cannot check if the names are written in uppers case letters, as they should:
		# && $text !~ /$o/ && $text =~ /$o/i ) {
		# I should keep an original version of the text to check this

		return ($tr{PERSONA}{$lang}, "Rule 2: [$tr{PERSONA}{$lang}] (($rank{Pila} + $rank{Surname}) > $rank{Lugar}");
	}
	
	# pending: I should check if the noun has a plural
	# because in that case it should not be a proper noun
	$head = $lemata{$head} if $lemata{$head};
	my $semtype = &tatatagger($head, $text);

	if ($semtype && $semtype ne " ? ") {
	
		# pending: I should use the triggers list instead of this
		if ($semtype =~ /^(point|area|punto|área|terreno|territorio)$/) {
			$semtype = $tr{LUGAR}{$lang};
		}
		
		if ($semtype =~ /^(human|man|woman|hombre|mujer|humano)$/) {
			$semtype = $tr{PERSONA}{$lang};
		}
		
		return (uc $semtype, "semantic type");
	}
	
	if (($rank{Pila}+$rank{Surname}) > 0) {
		if (($rank{Pila}+$rank{Surname}) > 1) {
			$heads{$head} = $tr{PERSONA}{$lang};
		}
		return ($tr{PERSONA}{$lang}, "Rule 3: [$tr{PERSONA}{$lang}] ($rank{Pila}+$rank{Surname})");
	}
	
	# ok, everything has failed 
	# the last resource is to take a look at the morphology
	# pending: this should be way larger

	my %minimorph = (
		'ness|ity|ncy' => "PROPERTY",
		'nomy|logy|graphy' => "STUDY", 
		'ist' => "PERSON",
		'ism' => "CONCEPT", # this could be better, 
				# like ideology
				# but there is no such type
				# in the coreont
		'ing' => "ACTIVITY",
		'shire|gton|town|wood' => "PLACE",
		'ción|miento' => "EVENTO",
		'nomía|logía|grafía' => "ESTUDIO",
	);

	foreach my $morf (keys %minimorph) {
		if ($head =~ /($morf)$/) {
			return ($minimorph{$morf}, "morphology: $1");
		}
	}
}


sub tatatagger {
	# this subroutine receives a noun and a context of occurrence
	# and it returns a semantic type or nothing
	my ($word, $sent) = @_;
	my $report = "\n[$word] "; 
	# this is a report of the decisions it made
	
	return $word if ($semtypes{$word});
	return " ? " if (!$senses{$word});

	my @tmp = keys %{$senses{$word}};
	my $totsns = scalar (@tmp);
	$report .= "\nSenses: $totsns\n";
	if ( $totsns == 1) {
		# we check if there is only one sense
		my ($back, $worp) =  &wopal($word, $tmp[0]);
		$report{$word} = $report." ". $worp; 
		return $back;
	}

	my %vect; # here we vectorize the definitions
	# pending: this should not be done here
	# all vectorization should be done once and before
	# the process begins
	chomp $sent;
	$sent = lc $sent;
	my @l = split /[$toksep]+/, $sent;
	foreach my $l (@l) {
		next if (!$l || $stop{$l} || $l =~ /[0-9]+/ );
		$vect{$l}++;
		if ($pulu{$l}) {
			$vect{$pulu{$l}}++;
		}
		if ($lema{$l}) {
			$vect{$lema{$l}}++;
		}
	}
		
	my %rank;

	foreach my $sense (sort keys %{$senses{$word}}) { 
		# Distance to sense: 
		next if ($sense eq $word);
		$report .= "\n\n*$sense* \n "; 
		foreach my $tokdef (sort keys %{$senses{$word}{$sense}}) {
			# first we go through all tokens in each definition/sense
			next if ($stop{$tokdef} 
			|| $tokdef eq $word
			|| $minimod{$tokdef} > $umbini 	
			
			); 
			
			# we check if there is some token in the context
			if ($vect{$tokdef}) {
				$report .= "\ndirect match: $tokdef  [$minimod{$tokdef}] *[$objects{$tokdef}]* " ;
				$rank{$sense} += 100; 
				# pending: this numbers are arbitrary
				# more experimentation should be done to decide what is the right weight
				if ($tokdef eq $sense) {
					$report .= "\n* Matches sense *";
					$rank{$sense} += 100;
				}
			}	
				
			if ($ponder{$tokdef}) {
				# here we check the associations of this word
				# in order to see if we find something
				foreach my $cooc (keys %{$ponder{$tokdef}}) {
					next if ($cooc eq $word);
					if ($vect{$cooc}
						&& $ponder{$tokdef}{$cooc} > 1.4
						&& $minimod{$cooc} < $umbini
					) {
						if ($lang eq "es") {
						# pending: the models are incompatible
						# and that is why I have to divert like this in the
						# case of Spanish
						# I should regenerate the Spanish model to make it 
						# identical to the English one
							my $pond = 100 * 
						($ponder{$tokdef}{$cooc} / ($disp{$tokdef} + $disp{$cooc}));
							next if ($pond < .001);
							$rank{$sense} += 1+ $pond;
						} else {			
							$rank{$sense} += $ponder{$tokdef}{$cooc};
						}

						if ($cooc eq $sense) {
							$report .= 
							"\nSemantic category: $sense [$minimod{$sense}] $rank{$sense}";
							$rank{$sense} += 50;
						}
					}				
				}
			}				
		}
	}
	
	if (!scalar keys %rank) {
		$report{$word} = $report;
		return ("* No result *");
	}
	$report .= "\n * FINAL RANK * for $word";
	my @r = sort { $rank{$b} <=> $rank{$a} } keys %rank;
	foreach my $r (@r) {
		$rank{$r} = sprintf "%.2f", $rank{$r};
		$report .= "\n$r: $rank{$r}";
	}
	
	$report .= "\n\nwop: ($word, $r[0])"; 
	my ($wop, $worp) = &wopal($word, $r[0]);
	
	return if ($wop =~ /^($nonos)$/);# the semantic type is not usefull
	
	$report{$word} = $report." ". $worp;
	return ($wop);

}

sub wopal {
	# This subroutine receives a noun
	# and checks if it is included as a semantic type
	# if it is not, it must go up in the taxonomy
	# to try to find one
	my ($word, $in) = @_;
	return if (!$in);
	my $worp = ""; # wopal report
	if (!$semtypes{$in}) {
		my $sca = scalar (@{$chain{$word}{$in}});
		$worp .= "\nClimbing ($sca)... $word: $in ~~~ ";
		foreach my $step (@{$chain{$word}{$in}}) {
			$worp .= "\n $step | ";
			if ($semtypes{$step}) {
				$worp .=  "\n ***$step: Found***";	
				$in = $step;
				last;
			}	
		}	
	}
	return ($in, $worp);
}

