-
-
Notifications
You must be signed in to change notification settings - Fork 23
/
Copy pathguru.ex
711 lines (627 loc) · 15.5 KB
/
guru.ex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
--****
-- === guru.ex
--
-- Searches for the best articles that contain the words that you type.
-- Each word can contain * and ? wildcard characters.
-- The articles are given a score and presented to you sorted by score.
-- The scoring system strongly favors articles that contain several of
-- your words, rather than just several occurrences of one of your words.
-- Some very common words are ignored (see noise_words).
-- e.g.
-- {{{
-- guru sequence* atom *pend g?r?
-- }}}
--
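-- Results are displayed on screen and also saved in "guru.out" in your home
-- directory (%HOMEDRIVE%%HOMEPATH% on Windows, $HOME on Unix), or in the
-- current directory when the home directory cannot be determined.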
--
-- ==== Hints
-- * remember to add * to words that can be pluralized or have many different endings.
-- * enter an important word twice to double the value of that word
-- * If you get a "Critical Error", type 'i' for ignore. It just
-- means that a file is currently locked by another application.
--
-- ==== Usage
--
-- to search EUPHORIA directories:
-- {{{
-- guru word1 word2 word3 ...
-- }}}
--
-- Euphoria .doc and other files are searched. .htm files are skipped.
--
-- To search the current directory and all subdirectories:
-- {{{
-- cdguru word1 word2 word3 ...
-- }}}
--
without type_check
include std/filesys.e
include std/wildcard.e
include std/graphics.e as g
include std/sort.e
include std/sequence.e
include std/text.e
-------- some user-modifiable parameters:
-- Name of the results file and the full path it is written to.
sequence log_name = "guru.out"
sequence home, log_path

-- getenv() gives the atom -1 for an unset variable. Concatenating keeps that
-- -1 as an element of home, so a missing variable can be detected below.
ifdef UNIX then
    home = "" & getenv("HOME")
elsedef
    home = getenv("HOMEDRIVE") & getenv("HOMEPATH")
end ifdef

if find(-1, home) then
    -- no usable home directory: write the log relative to the current directory
    log_path = log_name
else
    log_path = home & SLASH & log_name
end if
-- Wildcard patterns for files that are never searched (binaries, archives,
-- images, sounds). The UNIX list is written in lower case and the
-- non-UNIX list in upper case.
sequence skip_list
ifdef UNIX then
skip_list = {
"*.so", "*.lib", "*.o",
"*.tar", "*.zip", "*.gz", "*.dylib"
}
elsedef
skip_list = {
"*.EXE", "*.DLL", "*.LIB", "*.OBJ",
"*.SWP", "*.PAR", "*.ZIP", "*.BMP",
"*.GIF", "*.JPG", "*.WAV"
}
end ifdef
-- Search words that are ignored because they are extremely common.
-- A bare "?" or "*" wildcard is rejected the same way.
sequence noise_words
noise_words = {
"a", "an", "the", "to", "and", "of", "is", "or", "by", "as", "in",
"you", "are", "be", "if", "?", "*"
}
-- A text line containing this run of dashes gives matched words on that
-- line a scoring bonus in scan().
constant separator_line = repeat('-', 5)
constant MAX_CHUNKS = 20 -- maximum number of chunks to display
-- desired size for a chunk of text:
constant MIN_CHUNK_SIZE = 10, -- minimum number of lines
MAX_CHUNK_SIZE = 20 -- maximum number of lines
-- Marker characters inserted around a matched word by scan() and turned
-- into color changes by highlight(). safe_gets() replaces these codes with
-- '.' when they occur in the input, so they cannot be confused with text.
constant LEFT_HIGHLIGHT = 17, -- highlight markers for matched words
RIGHT_HIGHLIGHT = 16 -- (assume LEFT_HIGHLIGHT > RIGHT_HIGHLIGHT)
constant HIGHLIGHT_COLOR = BRIGHT_WHITE
-------- end of user-modifiable parameters
-- file numbers used with puts()/printf() for screen and error output
constant SCREEN = 1, ERR = 2
constant TRUE = 1, FALSE = 0
-- value compared against the result of getc() to detect end of file
constant EOF = -1
type boolean(integer x)
-- An integer restricted to the two truth values 0 and 1.
    return find(x, {0, 1}) != 0
end type
-- word_list:  the accepted (lower-cased) search words
-- word_count: number of matches found so far for each entry of word_list
-- file_spec:  assigned { "*.*" } by the main script
sequence word_list, word_count, file_spec
-- TRUE when the Euphoria directory (EUDIR) is being searched
boolean euphoria
-- screen row on which the running per-word match counts are printed
integer count_line
-- file number of the open results log
integer log_file
-- clean() starts a new output line once this many columns have been used
constant LINE_WIDTH = 83
-- amount added to an upper-case ASCII letter to get its lower-case form
constant TO_LOWER = 'a' - 'A'
function fast_lower(sequence s)
-- Return s with every ASCII letter 'A'..'Z' replaced by its lower-case form.
-- All other elements are returned unchanged.
-- Used instead of the standard lower() because it runs once per word scanned.
    integer ch
    for idx = 1 to length(s) do
        ch = s[idx]
        if ch >= 'A' and ch <= 'Z' then
            s[idx] = ch + TO_LOWER
        end if
    end for
    return s
end function
function clean(sequence line)
-- Prepare a line of text for display.
-- Control characters below 14 are handled as follows: '\n' and '\t' are
-- kept, '\r' becomes a blank and anything else becomes '.'.
-- A '\n' is inserted before a character whenever more than LINE_WIDTH
-- columns have already been produced since the last line start.
    sequence out
    integer ch, column
    out = ""
    column = 1
    for pos = 1 to length(line) do
        if column > LINE_WIDTH then
            -- current output line is full: break it here
            out &= '\n'
            column = 1
        end if
        ch = line[pos]
        column += 1
        if ch = '\n' then
            column = 1
        elsif ch = '\r' then
            ch = ' '
        elsif ch < 14 and ch != '\t' then
            ch = '.'
        end if
        out &= ch
    end for
    return out
end function
-- While TRUE, both_puts(), both_printf() and highlight() write to the screen.
-- print_chunk_list() sets it to FALSE when the user presses 'q'; the log
-- file is still written after that.
boolean display
display = TRUE
procedure both_puts(object text)
-- Write text to the log file, and to the screen too unless screen
-- output has been switched off.
    puts(log_file, text)
    if not display then
        return
    end if
    puts(SCREEN, text)
end procedure
procedure both_printf(sequence format, object values)
-- Formatted output to the log file, and to the screen too unless screen
-- output has been switched off.
    printf(log_file, format, values)
    if not display then
        return
    end if
    printf(SCREEN, format, values)
end procedure
-- Longest line (including the terminating '\n') that safe_gets() returns;
-- longer input lines are split.
constant MAX_LINE = 100
-- space for largest line
-- (reused by every safe_gets() call; callers receive a slice of it)
sequence buff
buff = repeat(0, MAX_LINE)
function safe_gets(integer fn)
-- Read the next line from file fn.
-- Returns a sequence that always ends in '\n', or EOF when nothing is left.
-- At most MAX_LINE characters are returned per call, so a very long input
-- line comes back in several pieces. This bounds memory use and output size.
-- Characters 0 and RIGHT_HIGHLIGHT..LEFT_HIGHLIGHT are replaced by '.'.
    integer ch, pos
    pos = 1
    while pos < MAX_LINE do
        ch = getc(fn)
        if ch <= LEFT_HIGHLIGHT then
            -- only low character codes need special treatment
            if ch = EOF then
                if pos = 1 then
                    return EOF
                end if
                -- final line had no newline: supply one
                buff[pos] = '\n'
                return buff[1 .. pos]
            elsif ch = '\n' then
                buff[pos] = ch
                return buff[1 .. pos]
            elsif ch = 0 or ch >= RIGHT_HIGHLIGHT then
                ch = '.'
            end if
        end if
        buff[pos] = ch
        pos += 1
    end while
    -- buffer full: force a line end
    buff[MAX_LINE] = '\n'
    return buff[1 .. MAX_LINE]
end function
function sum(sequence s)
-- Add up the elements of s, front to back. An empty sequence gives 0.
    atom total = 0
    for idx = 1 to length(s) do
        total = total + s[idx]
    end for
    return total
end function
-- State shared between scan() and next_word():
-- line:          the text line being examined (an atom at end of file)
-- line_next:     index in line of the next character next_word() will read
-- words_on_line: set to TRUE by next_word() once it finds any word
object line
integer line_next
boolean words_on_line
-- Classification of each character code 1..255, indexed by the code itself.
sequence char_class
-- 0 means not legitimate
-- 1 means legitimate char
-- > 1 means possible first char of matching word
char_class = repeat(0, 255)
-- letters, digits and underscore are the characters that make up a word
char_class['A' .. 'Z'] = 1
char_class['a' .. 'z'] = 1
char_class['0' .. '9'] = 1
char_class['_'] = 1
function has_punctuation(sequence word)
-- Return TRUE when word contains a character that is neither a word
-- character (per char_class) nor one of the wildcards '?' and '*'.
-- Return FALSE otherwise, including for an empty word.
    integer ch
    for pos = 1 to length(word) do
        ch = word[pos]
        if not find(ch, "?*") and char_class[ch] = 0 then
            return TRUE
        end if
    end for
    return FALSE
end function
function next_word()
-- Return next possible matching word from line
-- based on first letter of the word.
-- Reads from the file-level variable line starting at line_next and
-- advances line_next past whatever it consumes.
-- Returns the word as a sequence (original case), or -1 when the end of
-- the line is reached. Sets words_on_line to TRUE as soon as any word,
-- candidate or not, is found.
sequence word
integer c
while TRUE do
-- skip white space:
-- (that is, every character whose char_class is 0)
while TRUE do
c = line[line_next]
line_next += 1
if char_class[c] > 0 then
exit
elsif c = '\n' then -- there's always a '\n' at end of line
return -1
end if
end while
words_on_line = TRUE
-- check first letter in word:
if char_class[c] > 1 then
-- possible matching word
word = { c }
-- read rest of word
-- (line_next is left on the first character after the word)
while TRUE do
c = line[line_next]
if char_class[c] = 0 then
return word
end if
line_next += 1
word &= c
end while
else
-- not a possible matching word -skip it
-- without building it, then go round again for the next word
while TRUE do
c = line[line_next]
if char_class[c] = 0 then
exit
end if
line_next += 1
end while
end if
end while
end function
-- The best chunks found so far, highest score first. Each entry is
-- { score, file_name, chunk_lines }. The last entry is a permanent dummy
-- with score -1 that save_chunk() inserts in front of and
-- print_chunk_list() skips.
sequence chunk_list
chunk_list = { { -1, {}, {} } }
procedure highlight(sequence text)
-- Show text on the screen, using the marker characters embedded in it to
-- switch colors: LEFT_HIGHLIGHT turns the highlight color on and
-- RIGHT_HIGHLIGHT goes back to WHITE. The markers themselves are not printed.
-- Does nothing when screen output is switched off.
    integer ch
    if display then
        for pos = 1 to length(text) do
            ch = text[pos]
            if ch = LEFT_HIGHLIGHT then
                text_color(HIGHLIGHT_COLOR)
            elsif ch = RIGHT_HIGHLIGHT then
                text_color(WHITE)
            else
                puts(SCREEN, ch)
            end if
        end for
    end if
end procedure
procedure print_chunk_list()
-- Report the results: the per-word match counts, then every saved chunk
-- in score order. Everything goes to the log file. Chunks are also shown
-- on screen, pausing before each one after the first, until the user
-- answers 'q'.
    sequence item, body, shown
    integer total

    -- the last entry of chunk_list is the dummy terminator
    total = length(chunk_list) - 1

    position(count_line, 1)
    for w = 1 to length(word_list) do
        both_printf("%s:%d ", { word_list[w], word_count[w] })
    end for
    position(count_line + 1, 1)
    puts(SCREEN, repeat(' ', 80))
    puts(log_file, '\n')

    for n = 1 to total do
        if display and n > 1 then
            text_color(BRIGHT_GREEN)
            puts(SCREEN, "\nPress q to quit, Enter for more:")
            text_color(WHITE)
            puts(SCREEN, " ")
            if getc(0) = 'q' then
                -- keep logging, but stop writing chunks to the screen
                display = FALSE
            end if
        end if

        item = chunk_list[n]
        text_color(RED)
        both_printf("\n#%d of %d ------ %s --- score: %d ------\n",
                    { n, total, item[2], 100 * item[1] + 0.5 })
        text_color(WHITE)

        body = item[3]
        g:wrap(FALSE)
        for k = 1 to length(body) do
            shown = clean(body[k])
            highlight(shown)
            puts(log_file, shown)
        end for
        g:wrap(TRUE)
    end for

    if total > 0 then
        text_color(GREEN)
        puts(SCREEN, "\nSee " & log_path & '\n')
    end if
    text_color(WHITE)
    puts(SCREEN, " \n")
end procedure
procedure save_chunk(sequence file_name, sequence chunk, atom score)
-- Offer a chunk of text for the list of best chunks.
-- The raw score is divided by 10 + sqrt(number of lines), which slightly
-- penalizes larger chunks. The chunk is inserted in front of the first
-- entry that has a lower score, and the list is then cut back to
-- MAX_CHUNKS entries plus the dummy terminator.
    integer slot
    sequence item

    score = score / (10 + sqrt(length(chunk)))

    -- find the first entry that this chunk beats
    slot = 0
    for k = 1 to length(chunk_list) do
        if score > chunk_list[k][1] then
            slot = k
            exit
        end if
    end for
    if slot = 0 then
        return
    end if

    item = { score, file_name, chunk }
    chunk_list = chunk_list[1 .. slot - 1] & { item } &
                 chunk_list[slot .. length(chunk_list)]
    if length(chunk_list) > MAX_CHUNKS + 1 then
        -- drop the worst chunk on the list
        chunk_list = chunk_list[1 .. length(chunk_list) - 1]
    end if
end procedure
-- wild_word[i] is non-zero when word_list[i] contains a '*' or '?' wildcard
sequence wild_word
procedure scan(sequence file_name)
-- read next file
-- Searches one file for the words in word_list, groups its lines into
-- chunks, scores each chunk and hands the scoring ones to save_chunk().
-- Files that cannot be opened, or whose path contains ".svn", are skipped.
-- Pressing 'q' while a file is being read prints the results so far and
-- ends the program.
integer fileNum, first_hit, last_hit, new_chunk
sequence lword, chunk, word_value
object word
atom chunk_total, line_total
boolean doc_file, matched, first_match
-- SKIP .svn dir
if match(".svn", file_name) then
return
end if
fileNum = open(file_name, "rb")
if fileNum = -1 then
return
end if
-- is it a Euphoria .doc file?
doc_file = euphoria and match(".doc", fast_lower(file_name))
-- update display
-- (running match counts on one row, the current file name on the next)
g:wrap(FALSE)
position(count_line, 1)
for i = 1 to length(word_list) do
printf(SCREEN, "%s:%d ", { word_list[i], word_count[i] })
end for
position(count_line + 1, 1)
puts(SCREEN, "searching: " & file_name & repeat(' ', 80) & '\r')
g:wrap(TRUE)
new_chunk = TRUE
while TRUE do
-- initialize
if new_chunk then
chunk = {}
chunk_total = 0
first_hit = 0
last_hit = 0
new_chunk = FALSE
-- each search word starts every chunk at full value
word_value = repeat(1, length(word_list))
end if
line_next = 1
line_total = 0
-- read next line
line = safe_gets(fileNum)
if atom(line) then
exit -- end of file
end if
if get_key() = 'q' then
close(fileNum)
print_chunk_list()
abort(1)
end if
words_on_line = FALSE
while TRUE do
-- read next word in line
word = next_word()
if atom(word) then
exit
end if
lword = fast_lower(word)
first_match = TRUE
for i = 1 to length(word_list) do
if wild_word[i] then
-- slow
matched = wildcard:is_match(word_list[i], lword)
else
-- fast
matched = equal(word_list[i], lword)
end if
if matched then
-- score a bit higher for matching a non-wildcard word
-- (also for a line containing separator_line, and for a Euphoria .doc file)
line_total += word_value[i] * (1
+ 0.5 * (match(separator_line, line) != 0)
+ 0.3 * ( not wild_word[i])
+ 0.3 * doc_file)
word_count[i] += 1
-- further hits on the same search word in this chunk count half as much
word_value[i] /= 2
if first_match then
-- wrap the word in highlight markers, once only even if several
-- search words match it
first_match = FALSE
line = line[1 .. line_next - length(word) - 1] &
LEFT_HIGHLIGHT &
word &
RIGHT_HIGHLIGHT &
line[line_next .. length(line)]
-- step over the two marker characters just inserted
line_next += 2
end if
end if
end for
end while
chunk = append(chunk, line)
-- decide chunk boundaries
if words_on_line then
if line_total > 0 then
chunk_total += line_total
-- remember the first and last lines of the chunk that scored
last_hit = length(chunk)
if first_hit = 0 then
first_hit = last_hit
end if
end if
if chunk_total > 0 then
-- close a scoring chunk when the hits have stopped for a while,
-- or when it has reached the maximum size
if (line_total = 0 and
last_hit < length(chunk) - MIN_CHUNK_SIZE / 2 and
length(chunk) >= MIN_CHUNK_SIZE) or
length(chunk) >= MAX_CHUNK_SIZE then
if length(chunk) <= MIN_CHUNK_SIZE then
first_hit = 1
last_hit = length(chunk)
else
-- trim off some context, but not all
first_hit = floor((first_hit + 1) / 2)
last_hit = floor((last_hit + length(chunk)) / 2)
end if
save_chunk(file_name,
chunk[first_hit .. last_hit],
chunk_total)
new_chunk = TRUE
end if
elsif length(chunk) >= MIN_CHUNK_SIZE then
-- nothing found in this stretch: start over
new_chunk = TRUE
end if
elsif chunk_total = 0 and length(chunk) > MIN_CHUNK_SIZE / 2 then
-- a line without words ends a chunk that has not scored yet
new_chunk = TRUE
end if
end while
-- save whatever scoring chunk was still open at end of file
if chunk_total > 0 then
save_chunk(file_name, chunk, chunk_total)
end if
close(fileNum)
return
end procedure
function look_at(sequence path_name, sequence direntry)
-- see if a file name qualifies for searching
-- Callback passed to walk_dir().
-- path_name: directory that contains the entry
-- direntry:  the entry as supplied by walk_dir()
-- Directories, the results log and files matching skip_list are ignored;
-- every other file is passed to scan().
-- Always returns 0, so that the directory walk continues.
    sequence file_name, match_name

    if find('d', direntry[D_ATTRIBUTES]) then
        return 0 -- a directory
    end if
    file_name = direntry[D_NAME]

    -- equal() and wildcard:is_match() are case sensitive. On non-UNIX
    -- systems log_name and the skip_list patterns are upper case while the
    -- directory entry keeps whatever case is on disk, so compare an
    -- upper-cased copy there. The real name is still used to open the file.
    ifdef UNIX then
        match_name = file_name
    elsedef
        match_name = upper(file_name)
    end ifdef

    if equal(match_name, log_name) then
        return 0 -- avoid circularity
    end if

    -- check skip list
    for i = 1 to length(skip_list) do
        if wildcard:is_match(skip_list[i], match_name) then
            return 0
        end if
    end for

    path_name &= SLASH
    -- drop a leading "./" so displayed names are shorter
    if length(path_name) >= 2 and equal(path_name[1 .. 2], '.' & SLASH) then
        path_name = path_name[3 .. length(path_name)]
    end if
    path_name &= file_name
    scan(path_name)
    return 0
end function
procedure usage(sequence g)
-- Print the interactive banner and instructions, ending with the input
-- prompt. g is the text shown in front of " Guru" in the title line.
    sequence help_lines
    help_lines = {
        "Enter keywords that will define the subject you are interested in. \n",
        " - Upper/lower case is not important.\n",
        " - Words may contain * and ? wildcard characters,\n",
        " - example ---> get? input *routine*\n\n",
        "---> "
    }
    text_color(MAGENTA)
    puts(SCREEN, "\n\t\t" & g & " Guru\n\n")
    text_color(WHITE)
    for n = 1 to length(help_lines) do
        puts(SCREEN, help_lines[n])
    end for
end procedure
function blank_delim(sequence s)
-- break up a blank-delimited string
-- Returns the list of words in s that are separated by blanks or tabs.
-- Scanning stops at the first '\n' or at the end of s, whichever comes
-- first, so s does not need to end in a newline.
    sequence list
    integer i, n, start

    list = {}
    n = length(s)
    i = 1
    while i <= n do
        -- skip the blanks in front of the next word
        while i <= n and find(s[i], " \t") do
            i += 1
        end while
        if i > n or s[i] = '\n' then
            exit
        end if
        -- collect the word itself
        start = i
        while i <= n and not find(s[i], " \t\n") do
            i += 1
        end while
        list = append(list, s[start .. i - 1])
    end while
    return list
end function
-- On non-UNIX systems the log file name is compared in upper case.
ifdef not UNIX then
    log_name = upper(log_name)
end ifdef

clear_screen()

-- Work out the search words: either the command-line arguments, or a line
-- typed in reply to the usage prompt when none were given. A first
-- argument of "E!" selects a search of the Euphoria directories.
sequence cmd
object typed_line
cmd = command_line() -- eui guru.ex words...
euphoria = FALSE
if length(cmd) < 3 then
    usage("Current Directory")
    typed_line = gets(0)
    if atom(typed_line) then
        -- end of input: gets() gave -1, treat it as an empty reply
        typed_line = "\n"
    end if
    cmd = blank_delim(typed_line)
    puts(SCREEN, '\n')
elsif equal(cmd[3], "E!") then
    -- search Euphoria directories
    euphoria = TRUE
    if length(cmd) <= 3 then
        usage("Euphoria")
        typed_line = gets(0)
        if atom(typed_line) then
            -- end of input: gets() gave -1, treat it as an empty reply
            typed_line = "\n"
        end if
        cmd = blank_delim(typed_line)
        puts(SCREEN, '\n')
    else
        cmd = cmd[4 .. length(cmd)]
    end if
else
    cmd = cmd[3 .. length(cmd)]
end if

-- open the results log; nothing can be reported without it
log_file = open(log_path, "w")
if log_file = -1 then
    puts(ERR, "Couldn't open " & log_path & '\n')
    abort(1)
end if
-- Build word_list / wild_word from the candidate words in cmd.
-- Words are lower-cased; noise words, words containing punctuation and
-- empty words are rejected. The program ends if no usable word remains.
word_list = {}
wild_word = {}
for i = 1 to length(cmd) do
    cmd[i] = lower(cmd[i])
    if length(cmd[i]) = 0 then
        -- an empty command-line argument has no first character, which
        -- the char_class preparation that follows relies on
        puts(SCREEN, "ignoring: empty word\n")
    elsif find(cmd[i], noise_words) then
        puts(SCREEN, "ignoring: " & cmd[i] & " (too common)\n")
    elsif has_punctuation(cmd[i]) then
        puts(SCREEN, "ignoring: " & cmd[i] &
                     " (contains punctuation character)\n")
    else
        word_list = append(word_list, cmd[i])
        wild_word = append(wild_word, find('*', cmd[i]) or find('?', cmd[i]))
    end if
end for
if length(word_list) = 0 then
    abort(1)
end if
-- one match counter per accepted word
word_count = repeat(0, length(word_list))
integer first_char
-- Mark in char_class[] every character that could start a matching word,
-- so that next_word() can reject most words by their first letter alone.
for n = 1 to length(word_list) do
    first_char = word_list[n][1]
    if find(first_char, "*?") then
        -- a leading wildcard can match any first character:
        -- doubling turns every 1 into a value above 1
        char_class *= 2
        exit
    end if
    if char_class[first_char] > 0 then
        char_class[first_char] = 2
        -- mark the same letter in the other case as well
        if first_char >= 'a' and first_char <= 'z' then
            char_class[first_char + ('A' - 'a')] = 2
        elsif first_char >= 'A' and first_char <= 'Z' then
            char_class[first_char + ('a' - 'A')] = 2
        end if
    end if
end for
file_spec = { "*.*" }
-- quits after finishing current file
puts(SCREEN, "Press q to quit\n\n\n")
-- The row just above the cursor (after the blank lines printed above) is
-- where the running match counts are shown.
sequence gp
gp = get_position()
count_line = gp[1] - 1
-- Run the search: the Euphoria directory when requested, otherwise the
-- current directory and its subdirectories, then print the results.
object d
if euphoria then
d = getenv("EUDIR")
if atom(d) then
puts(ERR, "EUDIR not set\n")
puts(ERR, "Please set EUDIR to the location of EUPHORIA and try again")
-- NOTE(review): maybe_any_key() is not defined in this file; presumably
-- it is supplied by one of the included standard files - confirm
maybe_any_key()
abort(1)
end if
if sequence(dir(d)) then
-- reduce noise in Euphoria Help
skip_list &= { "*.HTM", "*.HTX", "*.DAT", "*.BAS", "*.BAT", "*.PRO",
"LW.DOC", "BIND.EX", "EX.ERR" }
ifdef UNIX then
skip_list = lower(skip_list)
end ifdef
-- the result of walk_dir() is deliberately ignored
if walk_dir(d, routine_id("look_at"), TRUE) then
end if
print_chunk_list()
abort(0)
end if
-- if dir(d) did not return a sequence, execution continues with the
-- current-directory search below
end if
puts(log_file, "Searching " & current_dir() & "\n\n")
-- use "." as the starting point when dir(".") works, otherwise the full
-- current directory path
sequence top_dir
if sequence(dir(".")) then
top_dir = "."
else
top_dir = current_dir()
end if
-- the result of walk_dir() is deliberately ignored
if walk_dir(top_dir, routine_id("look_at"), TRUE) then
end if
print_chunk_list()