#include "common.h"
#ifdef INFORMATION
This file covers routines that create and access a "dictionary entry" (WORDP) and the "meaning" of words (MEANING).
The dictionary acts as the central hash mechanism for accessing various kinds of data.
The dictionary consists of data imported from WORDNET 3.0 (copyright notice at end of file) + augmentations + system and script pseudo-words.

A word also gets the WordNet meaning ontology (->meanings & ->meaningCount). The definition of meaning in WordNet
is words that are synonyms in some particular context. Such a collection in WordNet is called a synset.

Since words can have multiple meanings (and be multiple parts of speech), the flags of a word are a summary
of all of the properties it might have, and it has a list of entries called "meanings". Each entry is a MEANING
and points to a circular list, one entry of which marks the word you land at as the synset head.
This is referred to as the "master" meaning and has the gloss (definition) of the meaning. The meaning list of a master node points back to
all the real words which comprise it.

Since WordNet has an ontology, its synsets are hooked to other synsets in various relations, in particular that
of parent and child. ChatScript represents these as facts. The hierarchy relation uses the verb "is" and
has the child as subject and the parent as object. Such a fact runs from the master entries, not any of the actual
word entries. So to see if "dog" is an "animal", you could walk every meaning list of the word animal and
mark the master nodes they point at. Then you would search every meaning of dog, jumping to the master nodes,
then look at facts with the master node as subject and the verb "is" and walk up to the object. If the object is
marked, you are there. Otherwise you take that object node as subject and continue walking up. Eventually you arrive at
a marked node or run out at the top of the tree.

Some words DO NOT have a master node. Their meaning is defined to be themselves (things like pronouns or determiners), so
their meaning value for a meaning merely points to themselves.

The meaning system is established by building the dictionary and NEVER changes thereafter during chatbot execution.
New words can transiently enter the dictionary for various purposes, but they will not have "meanings".

A MEANING is a reference to a specific meaning of a specific word. It is an index into the dictionary
(identifying the word) and an index into that word's meaning list (identifying the specific meaning).
A meaning index of 0 refers to all meanings of the word. A meaning index of 0 can also be type restricted
so that it only refers to noun, verb, adjective, or adverb meanings of the word.

Since there are only two words in WordNet with more than 63 meanings (break and cut), we limit all words to
no more than 63 meanings by discarding the excess meanings. Since meanings are stored most important first,
these are no big loss. This leaves room for the 5 essential type flags used for restricting a generic meaning.
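
As an illustrative sketch (Meaning2Word and Meaning2Index are defined in the dictionary headers; the other
names appear later in this file):

	WORDP D = FindWord((char*)"dog", 0, PRIMARY_CASE_ALLOWED);	// the dictionary entry for "dog"
	MEANING all = MakeMeaning(D);		// meaning index 0 - refers to every meaning of "dog"
	MEANING first = GetMeaning(D, 1);	// the most important WordNet meaning, when the word has meanings at all
	// Meaning2Word(first) recovers the dictionary entry that meaning points at (a synset member or master),
	// and Meaning2Index(first) recovers which of that entry's meanings it is.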
Space for dictionary words comes from a common pool. Dictionary words are
allocated linearly forward in the pool. Strings have their own pool (the heap).
All dictionary entries are indexable as a giant array.

The dictionary separately stores uppercase and lowercase forms of the same words (since they have different meanings).
There is only one uppercase form stored, so United and UnItED would be saved as one entry. The system will have
to decide which case a user intended, since they may not have bothered to capitalize a proper noun, or they
may have shouted a lowercase noun, and a noun at the start of the sentence could be either proper or not.

Dictionary words are hashed as lower case, but if the word has an uppercase letter it will be stored
in the adjacent higher bucket. Words of the basic system are stored in their appropriate hash bucket.

After the basic system is read in, the dictionary is frozen. This means it remembers the spots the allocation
pointers are at for the dictionary and heap space and is using mark-release memory management.

The hash buckets are themselves dictionary entries, sharing space. After the basic layers are loaded,
new dictionary entries are always allocated. Until then, if the word hashes to an empty bucket, that bucket
becomes the dictionary entry being added.

We mark synset entries with the word & meaning number & POS of the word in the dictionary entry. The POS is not used
explicitly by much of the system but is needed when seeing the dictionary definitions (:word) and if one wants to use
pos-restricted meanings in a match or in keywords.
#endif
int worstDictAvail = 1000000;
bool dictionaryBitsChanged = false;
unsigned int propertyRedefines = 0; // property changes on locked dictionary entries
unsigned int flagsRedefines = 0; // systemflags changes on locked dictionary entries
static int freeTriedList = 0;
bool xbuildDictionary = false; // indicate when building a dictionary
char dictionaryTimeStamp[20]; // indicate when dictionary was built
char* mini = "";
unsigned int* hashbuckets = 0;
static unsigned int rawWords = 0;
static unsigned char* writePtr; // used for binary dictionary writes
// memory data
unsigned long maxHashBuckets = MAX_HASH_BUCKETS;
bool setMaxHashBuckets = false;
uint64 maxDictEntries = MAX_DICTIONARY;
MEANING posMeanings[64]; // concept associated with propertyFlags of WORDs
MEANING sysMeanings[64]; // concept associated with systemFlags of WORDs
#include <map>
using namespace std;
std::map <WORDP, WORDP> irregularNouns;
std::map <WORDP, WORDP> irregularVerbs;
std::map <WORDP, WORDP> irregularAdjectives;
std::map <WORDP, WORDP> canonicalWords;
std::map <WORDP, int> wordValues; // per volley
std::map <WORDP, MEANING> backtracks; // per volley
std::map <WORDP, int> triedData; // per volley index into heap space
int concepts[MAX_SENTENCE_LENGTH]; // concept chains per word
int topics[MAX_SENTENCE_LENGTH]; // topics chains per word
bool fullDictionary = true; // we have a big master dictionary, not a mini dictionary
bool primaryLookupSucceeded = false;
void LoadRawDictionary(int mini);
bool TraceHierarchyTest(int x)
{
if (!(x & TRACE_HIERARCHY)) return false;
x &= -1 ^ (TRACE_ON|TRACE_ALWAYS|TRACE_HIERARCHY|TRACE_ECHO);
return x == 0; // must exactly have been trace hierarchy
}
MEANING GetMeaning(WORDP D, int index)
{
MEANING* meanings = GetMeanings(D);
if (meanings) return meanings[index];
else return MakeMeaning(D); // switch to generic non null meaning
}
void RemoveConceptTopic(int list[256], WORDP D,int index)
{
MEANING M = MakeMeaning(D);
int at = list[index];
MEANING* prior = NULL;
while (at)
{
MEANING* mlist = (MEANING*) Index2Heap(at);
if (M == mlist[0])
{
if (prior == NULL) list[index] = mlist[1]; // list head
else prior[1] = mlist[1];
return; // assumed only on list once
}
prior = mlist;
at = mlist[1];
}
}
void Add2ConceptTopicList(int list[256], WORDP D,int start,int end,bool unique)
{
MEANING M = MakeMeaning(D);
if (unique)
{
int at = list[start];
while (at)
{
MEANING* mlist = (MEANING*) Index2Heap(at);
if (M == mlist[0]) return; // already on list
at = mlist[1];
}
}
unsigned int* entry = (unsigned int*) AllocateHeap(NULL,2, sizeof(MEANING),false); // ref and link for topics and concepts indexed by word
entry[1] = list[start];
list[start] = Heap2Index((char*) entry);
entry[0] = M;
// entry[2] = (start << 8) | end; // currently unused because we use dictionary marks.
}
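// Layout note: each cell on a concept/topic chain is a pair of MEANING-sized heap slots.
// entry[0] holds the concept/topic word (as a MEANING) and entry[1] holds the heap index of the next cell
// (0 ends the chain). list[wordIndex] holds the heap index of the head cell, so Add2ConceptTopicList pushes
// onto the front and RemoveConceptTopic splices a cell out by relinking its predecessor's entry[1].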
void ClearVolleyWordMaps()
{
wordValues.clear();
backtracks.clear();
savedSentences = 0;
ClearWhereInSentence();
ClearTriedData(); // prevent document reuse
userVariableThreadList = 0;
}
void ClearWordMaps() // both static for whole dictionary and dynamic per volley
{
irregularNouns.clear();
irregularVerbs.clear();
irregularAdjectives.clear();
canonicalWords.clear();
triedData.clear(); // prevent document reuse
ClearVolleyWordMaps();
}
void ClearWordWhere(WORDP D,int at)
{
triedData.erase(D);
// also remove from concepts/topics lists
if (at == -1)
{
for (int i = 1; i <= wordCount; ++i)
{
RemoveConceptTopic(concepts,D,i);
RemoveConceptTopic(topics,D,i);
}
}
}
void ClearWhereInSentence() // erases the WHEREINSENTENCE and the TRIEDBITS
{
memset(concepts,0, sizeof(unsigned int) * MAX_SENTENCE_LENGTH);
memset(topics,0, sizeof(unsigned int) * MAX_SENTENCE_LENGTH);
// be able to reuse memory
if (documentMode) for (std::map<WORDP,int>::iterator it=triedData.begin(); it!=triedData.end(); ++it)
{
MEANING* data = (MEANING*) Index2Heap(it->second);
*data = freeTriedList;
freeTriedList = it->second;
}
triedData.clear();
memset(unmarked,0,MAX_SENTENCE_LENGTH);
}
void ClearTriedData() // erases the WHEREINSENTENCE and the TRIEDBITS
{
triedData.clear();
freeTriedList = 0;
}
void SetFactBack(WORDP D, MEANING M)
{
if (GetFactBack(D) == 0)
{
backtracks[D] = M;
}
}
MEANING GetFactBack(WORDP D)
{
std::map<WORDP,MEANING>::iterator it;
it = backtracks.find(D);
return (it != backtracks.end()) ? it->second : 0;
}
void ClearBacktracks()
{
backtracks.clear();
}
unsigned char* GetWhereInSentence(WORDP D) // [0] is the meanings bits, the rest are start/end/case bytes for 8 locations
{
std::map<WORDP,int>::iterator it;
it = triedData.find(D);
if (it == triedData.end()) return NULL;
char* data = Index2Heap(it->second);
if (data == 0) return NULL;
return (unsigned char*) (data + 8); // skip over 64bit tried by meaning field
}
unsigned int* AllocateWhereInSentence(WORDP D)
{
if (documentMode && freeTriedList) // reuse memory
{
MEANING* d = (MEANING*) Index2Heap(freeTriedList);
freeTriedList = *d;
}
size_t len = (sizeof(uint64) + maxRefSentence + 3)/4;
// 64bit tried by meaning field (aligned) + sentencerefs (2 bytes each + a byte for uppercase index)
unsigned int* data = (unsigned int*) AllocateHeap(NULL,len,4,false); // 64 bits (2 words) + 48 bytes (12 words) = 14 words
if (!data) return NULL;
memset((char*)data,0xff,len*4); // clears sentence xref start/end bits and casing byte
*data = 0; // clears the tried meanings list
data[1] = 0;
// store where in the temps data
int index = Heap2Index((char*) data); // original index!
triedData[D] = index;
return data + 2; // analogous to GetWhereInSentence
}
void SetTriedMeaning(WORDP D,uint64 bits)
{
unsigned int* data = (unsigned int*) GetWhereInSentence(D);
if (!data)
{
data = AllocateWhereInSentence(D);
if (!data) return; // failed to allocate
}
*(data - 2) = (unsigned int) (bits >> 32);
*(data - 1) = (unsigned int) (bits & 0xffffffff); // back up to the tried meaning area
}
uint64 GetTriedMeaning(WORDP D) // which meanings have been used (up to 64)
{
std::map<WORDP,int>::iterator it;
it = triedData.find(D);
if (it == triedData.end()) return 0;
unsigned int* data = (unsigned int*)Index2Heap(it->second); // original location
if (!data) return 0;
uint64 value = ((uint64)(data[0])) << 32;
value |= (uint64)data[1];
return value; // back up to the correct meaning zone
}
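// Usage sketch: callers track which meanings of a word have already been considered by setting bits in
// this 64-bit mask. One plausible convention (the actual bit assignment is chosen by the calling code,
// not mandated here):
//   uint64 tried = GetTriedMeaning(D);
//   tried |= (1ull << (n - 1));        // hypothetically mark meaning n as tried
//   SetTriedMeaning(D, tried);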
void SetPlural(WORDP D,MEANING M)
{
irregularNouns[D] = dictionaryBase + M; // must directly assign since word on load may not exist
}
void SetComparison(WORDP D,MEANING M)
{
irregularAdjectives[D] = dictionaryBase + M; // must directly assign since word on load may not exist
}
void SetTense(WORDP D,MEANING M)
{
irregularVerbs[D] = dictionaryBase + M; // must directly assign since word on load may not exist
}
void SetCanonical(WORDP D,MEANING M)
{
canonicalWords[D] = dictionaryBase + M; // must directly assign since word on load may not exist
}
WORDP RawCanonical(WORDP D)
{
std::map<WORDP, WORDP>::iterator it;
it = canonicalWords.find(D);
if (it == canonicalWords.end()) return NULL;
return it->second;
}
WORDP GetCanonical(WORDP D)
{
std::map<WORDP, WORDP>::iterator it;
it = canonicalWords.find(D);
if (it == canonicalWords.end()) return NULL;
WORDP E = it->second;
if (*E->word != '`') return E; // normal english canonical
// foreign canonicals have multiple choices. This code only picks the first one. Other code may make a more complex decision, but generally treetagger will do that.
char word[MAX_WORD_SIZE];
strcpy(word, E->word + 1);
char* end = strchr(word, '`');
*end = 0;
end = strchr(word, '|'); // multiple choice?
if (end) *end = 0; // pick 1st one
return StoreWord(word, AS_IS);
}
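// Foreign canonical values are stored as a backquote-wrapped string with '|' separating the alternatives,
// e.g. a stored entry of `gehen|ging` (hypothetical) would make GetCanonical return the word "gehen".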
WORDP GetTense(WORDP D)
{
std::map<WORDP,WORDP>::iterator it;
it = irregularVerbs.find(D);
return (it != irregularVerbs.end()) ? it->second : NULL;
}
WORDP GetPlural(WORDP D)
{
std::map<WORDP,WORDP>::iterator it;
it = irregularNouns.find(D);
return (it != irregularNouns.end()) ? it->second : NULL;
}
WORDP GetComparison(WORDP D)
{
std::map<WORDP,WORDP>::iterator it;
it = irregularAdjectives.find(D);
return (it != irregularAdjectives.end()) ? it->second : NULL;
}
void SetWordValue(WORDP D, int x)
{
wordValues[D] = x;
}
int GetWordValue(WORDP D)
{
std::map<WORDP,int>::iterator it;
it = wordValues.find(D);
return (it != wordValues.end()) ? it->second : 0;
}
// start and ends of space allocations
WORDP dictionaryBase = 0; // base of allocated space that encompasses dictionary, heap space, and meanings
WORDP dictionaryFree; // current next dict space available going forward (not a valid entry)
// return-to values for layers
WORDP dictionaryPreBuild[NUMBER_OF_LAYERS+1];
char* stringsPreBuild[NUMBER_OF_LAYERS+1];
// build0Facts
// return-to values after build1 loaded, before user is loaded
WORDP dictionaryLocked;
FACT* factLocked = 0;
char* stringLocked;
// format of word looked up
uint64 verbFormat;
uint64 nounFormat;
uint64 adjectiveFormat;
uint64 adverbFormat;
// dictionary ptrs for these words
WORDP Dplacenumber;
WORDP Dpropername;
MEANING Mphrase;
MEANING MabsolutePhrase;
MEANING MtimePhrase;
WORDP Dclause;
WORDP Dverbal;
WORDP Dmalename,Dfemalename,Dhumanname;
WORDP Dtime;
WORDP Dunknown;
WORDP Dchild,Dadult;
WORDP Dtopic;
MEANING Mchatoutput;
MEANING Mburst;
MEANING Mpending;
MEANING Mkeywordtopics;
MEANING Mconceptlist;
MEANING Mmoney;
MEANING Mintersect;
MEANING MgambitTopics;
MEANING MadjectiveNoun;
MEANING Mnumber;
WORDP Dpronoun;
WORDP Dadjective;
WORDP Dauxverb;
WORDP DunknownWord;
static char* predefinedSets[] = // some internally mapped concepts not including emotions from LIVEDATA/interjections
{
(char*)"~repeatme",(char*)"~repeatinput1",(char*)"~repeatinput2",(char*)"~repeatinput3",(char*)"~repeatinput4",(char*)"~repeatinput5",(char*)"~repeatinput6",(char*)"~uppercase",(char*)"~utf8",(char*)"~sentenceend",
(char*)"~pos",(char*)"~sys",(char*)"~grammar_role",(char*)"~daynumber",(char*)"~yearnumber",(char*)"~dateinfo",(char*)"~formatteddate",(char*)"~email_url",(char*)"~fahrenheit",(char*)"~celsius",(char*)"~kelvin",
(char*)"~kindergarten",(char*)"~grade1_2",(char*)"~grade3_4",(char*)"~grade5_6",(char*)"~twitter_name",(char*)"~hashtag_label",
(char*)"~shout",(char*)"~distance_noun_modify_adverb",
(char*)"~distance_noun_modify_adjective",(char*)"~modelnumber",
(char*)"~time_noun_modify_adverb",
(char*)"~time_noun_modify_adjective",
NULL
};
void DictionaryRelease(WORDP until,char* stringUsed)
{
WORDP D;
while (propertyRedefines) // must release these
{
unsigned int * at = (unsigned int *) Index2Heap(propertyRedefines); // 1st is index of next, 2nd is index of dict, 3/4 is properties to replace
if (!at) // bug - its been killed
{
propertyRedefines = 0;
break;
}
if ((char*)at >= stringUsed) break; // not part of this freeing
propertyRedefines = *at;
D = Index2Word(at[1]);
D->properties = *((uint64*) (at+2));
}
while (flagsRedefines) // must release these
{
unsigned int * at = (unsigned int*) Index2Heap(flagsRedefines); // 1st is index of next, 2nd is index of dict, 3/4 is properties to replace
if (!at) // bug - its been killed
{
flagsRedefines = 0;
break;
}
if ((char*)at >= stringUsed) break; // not part of this freeing
flagsRedefines = *at;
D = Index2Word(at[1]);
D->systemFlags = *((uint64*) (at+2));
}
if (until) while (dictionaryFree > until) DeleteDictionaryEntry(--dictionaryFree); // remove entry from buckets
heapFree = stringUsed;
}
char* UseDictionaryFile(char* name)
{
static char junk[100];
if (*mini) sprintf(junk,(char*)"DICT/%s",mini);
else if (!*language) sprintf(junk,(char*)"%s",(char*)"DICT");
else if (!name) sprintf(junk,(char*)"DICT/%s",language);
else sprintf(junk,(char*)"DICT/%s",language);
MakeDirectory(junk); // if it doesnt exist
if (name && *name)
{
strcat(junk,(char*)"/");
strcat(junk,name);
}
return junk;
}
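// Example of the paths this produces (assuming language == "english" and no mini dictionary is active):
//   UseDictionaryFile(NULL)                -> "DICT/english"            (directory created if missing)
//   UseDictionaryFile((char*)"facts.txt")  -> "DICT/english/facts.txt"
// When a mini dictionary build is active, the mini name replaces the language subdirectory.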
MEANING FindChild(MEANING who,int n)
{ // GIVEN SYNSET
FACT* F = GetObjectNondeadHead(who);
unsigned int index = Meaning2Index(who);
while (F)
{
FACT* at = F;
F = GetObjectNondeadNext(F);
if (at->verb != Mis) continue;
if (index && at->object != who) continue; // not us
if (--n == 0) return at->subject;
}
return 0;
}
bool ReadForeignPosTags(char* fname)
{
FILE* in = FopenReadOnly(fname);
if (!in) return false;
char word[MAX_WORD_SIZE];
char wordx[MAX_WORD_SIZE];
*word = '_'; // marker to keep any collision away from foreign pos
*wordx = '~'; // corresponding concept
while (ReadALine(readBuffer, in) >= 0) // foreign name followed by bits of english pos
{
char* ptr = ReadCompiledWord(readBuffer, word + 1);
if (!word[1] || word[1] == '#')
continue;
uint64 flags = 0;
char flag[MAX_WORD_SIZE];
while (*ptr) // get english pos values of this tag
{
ptr = ReadCompiledWord(ptr, flag);
if (!*flag || *flag == '#') break;
uint64 val = FindValueByName(flag);
if (!val)
{
(*printer)("Unable to find flag %s\r\n", flag);
}
flags |= val;
}
StoreWord(word, flags); // _foreignpos gets bits
MakeLowerCopy(wordx + 1, word + 1); // force the name to lowercase because the name might not be valid, e.g. DET:ART
BUILDCONCEPT(wordx);
}
fclose(in);
// Also store the unknown tag, just in case the pos tagger fails to attach anything
strcpy(word + 1, (char*)"unknown-tag");
StoreWord(word, 0);
*word = '~'; // corresponding concept
BUILDCONCEPT(word);
return true;
}
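// The file read above holds one tag per line: the foreign POS tag name followed by the ChatScript POS flag
// names it maps to, with '#' introducing comments. A hypothetical line might be:
//   ADJA  ADJECTIVE ADJECTIVE_NORMAL
// Each tag becomes a '_'-prefixed dictionary word carrying those property bits plus a '~'-prefixed concept.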
unsigned char BitCount(uint64 n)
{
unsigned char count = 0;
while (n)
{
count++;
n &= (n - 1);
}
return count;
}
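// BitCount uses Kernighan's trick: n &= (n - 1) clears the lowest set bit on each pass, so the loop runs
// once per set bit. For example, BitCount(0x29) == 3 (bits 0, 3 and 5 are set).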
WORDP GetSubstitute(WORDP D)
{
return (D && D->internalBits & HAS_SUBSTITUTE) ? D->w.substitutes : 0;
}
void BuildShortDictionaryBase();
static void EraseFile(char* file)
{
FILE* out = FopenUTF8Write(UseDictionaryFile(file));
FClose(out);
}
void ClearDictionaryFiles()
{
char buffer[MAX_WORD_SIZE];
int ans = remove(UseDictionaryFile((char*)"dict.bin"));
remove(UseDictionaryFile((char*)"facts.bin"));
EraseFile((char*)"dict.bin"); // create but empty file
EraseFile((char*)"facts.bin"); // create but empty file
unsigned int i;
for (i = 'a'; i <= 'z'; ++i)
{
sprintf(buffer,(char*)"%c.txt",i);
EraseFile(buffer); // create but empty file
}
for (i = '0'; i <= '9'; ++i)
{
sprintf(buffer,(char*)"%c.txt",i);
EraseFile(buffer); // create but empty file
}
}
void BuildDictionary(char* label)
{
xbuildDictionary = true;
int miniDict = 0;
char word[MAX_WORD_SIZE];
mini = language;
char* ptr = ReadCompiledWord(label,word);
bool makeBaseList = false;
if (!stricmp(word,(char*)"wordnet")) // the FULL wordnet dictionary w/o synset removal
{
miniDict = -1;
ReadCompiledWord(ptr,word);
mini = "WORDNET";
}
else if (!stricmp(word,(char*)"basic"))
{
miniDict = 1;
makeBaseList = true;
FClose(FopenUTF8Write((char*)"RAWDICT/basicwordlist.txt"));
maxHashBuckets = 10000;
setMaxHashBuckets = true;
mini = "BASIC";
}
else if (!stricmp(word,(char*)"layer0") || !stricmp(word,(char*)"layer1")) // a mini dictionary
{
miniDict = !stricmp(word,(char*)"layer0") ? 2 : 3;
mini = (miniDict == 2) ? (char*)"LAYER0" : (char*)"LAYER1";
maxHashBuckets = 10000;
setMaxHashBuckets = true;
}
else if (stricmp(language, (char*)"english") ) // a foreign dictionary
{
miniDict = 6;
maxHashBuckets = 10000;
setMaxHashBuckets = true;
}
UseDictionaryFile(NULL);
InitStackHeap();
InitDictionary();
InitFacts();
InitCache();
LoadRawDictionary(miniDict);
if (miniDict && miniDict != 6) StoreWord((char*)"minidict"); // mark it as a mini dictionary
// dictionary has been built now
(*printer)((char*)"%s",(char*)"Dumping dictionary\r\n");
ClearDictionaryFiles();
WalkDictionary(WriteDictionary);
if (makeBaseList) BuildShortDictionaryBase(); // write out the basic dictionary
remove(UseDictionaryFile((char*)"dict.bin")); // invalidate cache of dictionary, forcing binary rebuild later
WriteFacts(FopenUTF8Write(UseDictionaryFile((char*)"facts.txt")),factBase);
sprintf(logFilename,(char*)"%s/build_log.txt",users); // all data logged here by default
FILE* out = FopenUTF8Write(logFilename);
FClose(out);
(*printer)((char*)"dictionary dump complete %d\r\n",miniDict);
echo = true;
xbuildDictionary = false;
CreateSystem();
}
void InitDictionary()
{
// read what the default dictionary wants as hash if parameter didnt set it
if (!setMaxHashBuckets)
{
FILE* in = FopenStaticReadOnly(UseDictionaryFile((char*)"dict.bin")); // DICT
if (in)
{
maxHashBuckets = Read32(in); // bucket size used by dictionary file
FClose(in);
}
}
dictionaryLocked = 0;
userTopicStoreSize = userCacheCount * (userCacheSize+2); // minimum file cache spot
userTopicStoreSize /= 64;
userTopicStoreSize = (userTopicStoreSize * 64) + 64;
// dictionary and meanings
size_t size = (size_t)(sizeof(WORDENTRY) * maxDictEntries);
size /= sizeof(WORDENTRY);
size = (size * sizeof(WORDENTRY)) + sizeof(WORDENTRY);
size /= 64;
size = (size * 64) + 64;
// on FUTURE startups (not 1st) the userCacheCount has been preserved while the rest of the system is reloaded
if ( dictionaryBase == 0) // 1st startup allocation -- not needed on a reload
{
dictionaryBase = (WORDP) malloc(size);
}
#ifdef EXPLAIN
	Conjoined heap space						Independent heap space
	heapBase -- allocates downwards				heapBase -- allocates downwards
	heapFree									heapFree
	heapEnd -- stackFree - allocates upwards
	------------------------- disjoint memory
	dictionaryFree
	dictionaryBase -- allocates upwards

	userTopicStore 1..n
	userTable refers to userTopicStore
	cacheBase
#endif
memset(dictionaryBase,0,size);
dictionaryFree = dictionaryBase + 1; // dont write on 0
dictionaryPreBuild[LAYER_0] = dictionaryPreBuild[LAYER_1] = dictionaryPreBuild[LAYER_BOOT] = 0; // in initial dictionary
factsPreBuild[LAYER_0] = factsPreBuild[LAYER_1] = factsPreBuild[LAYER_BOOT] = factFree; // last fact in dictionary
propertyRedefines = flagsRedefines = 0;
hashbuckets = (unsigned int*)AllocateHeap(0, (maxHashBuckets+1), sizeof(int)); // +1 for the upper case hash
memset(hashbuckets, 0, (maxHashBuckets+1) * sizeof(int));
}
void AddInternalFlag(WORDP D, unsigned int flag)
{
if (flag && flag != D->internalBits) // prove there is a change; a simple & test is not enough since some (but not all) of the bits may already be set
{
if (D < dictionaryLocked)
return;
D->internalBits |= (unsigned int) flag;
}
}
void RemoveInternalFlag(WORDP D,unsigned int flag)
{
D->internalBits &= -1 ^ flag;
}
static void PreserveSystemFlags(WORDP D)
{
unsigned int* at = (unsigned int*) AllocateHeap (NULL,2, sizeof(uint64),false); // PreserveSystemFlags
*at = flagsRedefines;
at[1] = Word2Index(D);
flagsRedefines = Heap2Index((char*)at);
*((uint64*)(at+2)) = D->systemFlags;
}
void AddSystemFlag(WORDP D, uint64 flag)
{
if (flag & NOCONCEPTLIST && *D->word != '~') flag ^= NOCONCEPTLIST; // not allowed to mark anything but concepts with this
if (flag && flag != (D->systemFlags & flag)) // prove there is a change; a simple & test is not enough since some (but not all) of the bits may already be set
{
if (D < dictionaryLocked) PreserveSystemFlags(D);
D->systemFlags |= flag;
}
}
void AddParseBits(WORDP D, unsigned int flag)
{
if (flag && flag != D->parseBits) // prove there is a change; a simple & test is not enough since some (but not all) of the bits may already be set
{
if (D < dictionaryLocked)
return;
D->parseBits |= (unsigned int) flag;
}
}
void RemoveSystemFlag(WORDP D, uint64 flags)
{
if (D->systemFlags & flags)
{
if (D < dictionaryLocked) PreserveSystemFlags(D);
D->systemFlags &= -1LL ^ flags;
}
}
static void PreserveProperty(WORDP D)
{
unsigned int* at = (unsigned int*) AllocateHeap (NULL,2, sizeof(uint64),false); // PreserveProperty
*at = propertyRedefines;
at[1] = Word2Index(D);
propertyRedefines = Heap2Index((char*)at);
*((uint64*)(at+2)) = D->properties;
}
void AddProperty(WORDP D, uint64 flag)
{
if (flag && flag != (D->properties & flag))
{
if (D < dictionaryLocked) PreserveProperty(D);
if (!(D->properties & (PART_OF_SPEECH)) && flag & PART_OF_SPEECH && *D->word != '~' && *D->word != '^' && *D->word != USERVAR_PREFIX && flag != CURRENCY) // not topic,concept,function
{
// internal use, do not allow idioms on words from #defines or user variables or sets.. but allow substitutes to do it?
unsigned int n = BurstWord(D->word);
if (n != 1)
{
WORDP E = StoreWord(JoinWords(1)); // create the 1-word header
if (n > GETMULTIWORDHEADER(E)) SETMULTIWORDHEADER(E,n); // mark it can go this far for an idiom
}
}
D->properties |= flag;
}
}
void RemoveProperty(WORDP D, uint64 flags)
{
if (D->properties & flags)
{
if (D < dictionaryLocked) PreserveProperty(D);
D->properties &= -1LL ^ flags;
}
}
bool StricmpUTF(char* w1, char* w2, int len)
{
unsigned char c1, c2;
unsigned char* word1 = (unsigned char*)w1;
unsigned char* word2 = (unsigned char*)w2;
while (len && *word1 && *word2)
{
if (*word1 == 0xc3) // utf8 case sensitive
{
c1 = *word1;
c2 = *word2;
if (c1 == c2);
else if (c1 >= 0x9f && c1 <= 0xbf) // lower case form
{
if (c2 >= 0x80 && c2 <= 0x9e && c1 != 0x9f && c1 != 0xb7) // uppercase form
{
c1 -= 0xa0;
c1 += 0x80;
}
if (c1 != c2) return true;
}
else if (c2 >= 0x9f && c2 <= 0xbf) // lower case form
{
if (c1 >= 0x80 && c1 <= 0x9e && c2 != 0x9f && c2 != 0xb7) // uppercase form
{
c2 -= 0xa0;
c2 += 0x80;
}
if (c1 != c2) return true;
}
}
else if (word1[1] >= 0xc4 && word1[1] <= 0xc9)
{
c1 = *word1;
c2 = *word2;
if (c1 == c2);
else if (c1 & 1 && (c1 & 0xf7) == c2); // lower case form
else if (c2 & 1 && (c2 & 0xf7) == c1);
else return true;
}
else if (toLowercaseData[*word1] != toLowercaseData[*word2]) return true;
int size = UTFCharSize((char*)word1);
word1 += size;
word2 += size;
len -= size;
}
if (len == 0) return false; // if matched all characters we needed to, then we are done
return *word1 || *word2;
}
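// Note on semantics: StricmpUTF returns false when the two words match case-insensitively over len
// characters and true when they differ, so callers use !StricmpUTF(a, b, len) to test for a match.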
int GetWords(char* word, WORDP* set, bool strictcase)
{
int index = 0;
size_t len = strlen(word);
if (len >= MAX_WORD_SIZE) return 0; // not legal
char commonword[MAX_WORD_SIZE];
strcpy(commonword, word);
char* at = commonword;
while ((at = strchr(at, ' '))) *at = '_'; // match as underscores whenever spaces occur (hash also treats them the same)
bool hasUpperCharacters;
bool hasUTF8Characters;
uint64 fullhash = Hashit((unsigned char*)word, len, hasUpperCharacters, hasUTF8Characters); // sets hasUpperCharacters and hasUTF8Characters
unsigned int hash = (fullhash % maxHashBuckets); // mod by the size of the table
// lowercase bucket
WORDP D = dictionaryBase + hashbuckets[hash];
char word1[MAX_WORD_SIZE];
if (strictcase && hasUpperCharacters) D = dictionaryBase; // lower case not allowed to match uppercase input
while (D != dictionaryBase)
{
if (fullhash == D->hash && D->length == len)
{
strcpy(word1, D->word);
at = word1;
while ((at = strchr(at, ' '))) *at = '_'; // match as underscores whenever spaces occur (hash also treats them the same)
if (!StricmpUTF(word1, commonword, len)) set[index++] = D;
}
D = dictionaryBase + GETNEXTNODE(D);
}
// upper case bucket
D = dictionaryBase + hashbuckets[hash + 1];
if (strictcase && !hasUpperCharacters) D = dictionaryBase; // upper case not allowed to match lowercase input
while (D != dictionaryBase)
{
if (fullhash == D->hash && D->length == len)
{
strcpy(word1, D->word);
at = word1;
while ((at = strchr(at, ' '))) *at = '_'; // match as underscores whenever spaces occur (hash also treats them the same)
if (!StricmpUTF(word1, commonword, len)) set[index++] = D;
}
D = dictionaryBase + GETNEXTNODE(D);
}
return index;
}
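// Usage sketch (the result buffer size is the caller's choice; 100 here is just an assumption):
//   WORDP words[100];
//   int n = GetWords((char*)"polish", words, false); // gathers matching entries from both case buckets
//   for (int i = 0; i < n; ++i) { /* inspect words[i]->properties, words[i]->word, ... */ }
// Passing strictcase == true limits the results to entries whose casing matches the input.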
int UTFCharSize(char* utf)
{
unsigned int count; // bytes of character
unsigned char c = (unsigned char)*utf;
if (!(c & 0x80)) count = 1; // ordinary ascii char
else if ((c & 0xE0) == 0xC0) count = 2; // 110xxxxx - 2-byte sequence
else if ((c & 0xF0) == 0xE0) count = 3; // 1110xxxx - 3-byte sequence
else if ((c & 0xF8) == 0xF0) count = 4; // 11110xxx - 4-byte sequence
else count = 1;
return count;
}
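// Usage sketch: stepping through a UTF-8 buffer one character (not one byte) at a time:
//   char* at = text;                 // 'text' is any nul-terminated UTF-8 string (hypothetical)
//   while (*at)
//   {
//       int bytes = UTFCharSize(at); // 1-4 bytes for this character
//       at += bytes;
//   }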
WORDP FindWord(const char* word, int len,uint64 caseAllowed)
{
if (word == NULL || *word == 0) return NULL;
if (len == 0) len = strlen(word);
if (len == 0) return NULL;
bool hasUpperCharacters;
bool hasUTF8Characters;
uint64 fullhash = Hashit((unsigned char*) word,len,hasUpperCharacters,hasUTF8Characters); // sets hasUpperCharacters and hasUTF8Characters
unsigned int hash = (fullhash % maxHashBuckets); // mod by the size of the table
if (caseAllowed & LOWERCASE_LOOKUP){;} // stay in lower bucket regardless
// rule label (topic.name) uses uppercase lookup
else if (*word == SYSVAR_PREFIX || *word == USERVAR_PREFIX || (*word == '~' && !strchr(word, '.')) || *word == '^')
{
if (caseAllowed == UPPERCASE_LOOKUP) return NULL; // not allowed to find
caseAllowed = LOWERCASE_LOOKUP; // these are always lower case
}
else if (hasUpperCharacters || (caseAllowed & UPPERCASE_LOOKUP)) ++hash;
// you can search on upper or lower specifically (not both) or primary or secondary or both
// normal or fixed case bucket
WORDP D;
primaryLookupSucceeded = true;
if (caseAllowed & (PRIMARY_CASE_ALLOWED|LOWERCASE_LOOKUP|UPPERCASE_LOOKUP))
{
D = Index2Word(hashbuckets[hash]);
WORDP almost = NULL;
WORDP preferred = NULL;
WORDP exact = NULL;
while (D != dictionaryBase)
{
if (fullhash == D->hash && D->length == len && !StricmpUTF(D->word,(char*)word,len)) // they match independent of case-
{
if (caseAllowed == LOWERCASE_LOOKUP) return D; // incoming word MIGHT have uppercase letters but we will be in lower bucket
else if (hasUpperCharacters) // we are looking for uppercase or primary case and are in uppercase bucket
{
if (D->internalBits & PREFER_THIS_UPPERCASE) preferred = D;
else if (!strncmp(D->word,word,len)) exact = D; // exactly what we want in upper case
else almost = D; // remember semi-match in upper case
}
else // if uppercase lookup, we are in uppercase bucket (input was lower case) else we are in lowercase bucket
{
return D; // it is exactly what we want in lower case OR its an uppercase acceptable match
}
}
D = dictionaryBase + GETNEXTNODE(D);
}
if (preferred) return preferred;
else if (exact) return exact;
else if (almost) return almost; // uppercase request we found in a different form only
}
// alternate case bucket (checking opposite case)
primaryLookupSucceeded = false;
if (caseAllowed & SECONDARY_CASE_ALLOWED)
{
WORDP almost = NULL;
WORDP preferred = NULL;
WORDP exact = NULL;
D = dictionaryBase + hashbuckets[hash + ((hasUpperCharacters) ? -1 : 1)];
while (D != dictionaryBase)
{
if (fullhash == D->hash && D->length == len && !StricmpUTF(D->word, (char*)word, len))
{
if (hasUpperCharacters) return D; // lowercase form
if (D->internalBits & PREFER_THIS_UPPERCASE) preferred = D;
else almost = D; // remember a match in upper case
}
D = dictionaryBase + GETNEXTNODE(D);
}
if (preferred) return preferred;
else if (exact) return exact;
else if (almost) return almost; // uppercase request we found in a different form only
}
return NULL;
}
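// Usage sketch for the case-control flags handled above:
//   FindWord("paris", 0, LOWERCASE_LOOKUP);   // only consult the lowercase bucket
//   FindWord("paris", 0, UPPERCASE_LOOKUP);   // only consult the uppercase bucket (e.g. "Paris")
//   FindWord("Paris", 0, PRIMARY_CASE_ALLOWED | SECONDARY_CASE_ALLOWED); // prefer the given case, then
//                                             // fall back to the opposite-case bucket
// Names starting with a variable prefix, '^', or a plain '~' are always forced to a lowercase lookup.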
static WORDP AllocateEntry()
{
WORDP D = dictionaryFree++;
int index = Word2Index(D);
int avail = maxDictEntries - index;
if (avail <= 0)
{
ReportBug((char*)"FATAL: used up all dict nodes\r\n")
}
if (avail < worstDictAvail) worstDictAvail = avail;
memset(D,0,sizeof(WORDENTRY));
return D;
}
WORDP StoreWord(int val) // create a number word
{
char value[MAX_WORD_SIZE];
sprintf(value,(char*)"%d",val);
return StoreWord(value);
}
WORDP StoreWord(char* word, uint64 properties, uint64 flags)
{
WORDP D = StoreWord(word,properties);
AddSystemFlag(D,flags);
return D;
}
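// Usage sketch (the flag names are standard ChatScript bits defined in the headers, used here as an assumption):
//   WORDP number = StoreWord(42);                                  // creates/finds the word "42"
//   WORDP noun   = StoreWord((char*)"gizmo", NOUN | NOUN_SINGULAR); // add property bits
//   WORDP marked = StoreWord((char*)"gizmo", 0, KINDERGARTEN);      // add system flag bits only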
WORDP StoreWord(char* word, uint64 properties)