-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathquery.rb
More file actions
75 lines (61 loc) · 1.6 KB
/
query.rb
File metadata and controls
75 lines (61 loc) · 1.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
require_relative 'inverted_index'
# Runs free-text queries against an InvertedIndex built over a directory and
# prints the best-matching files.
#
# NOTE(review): InvertedIndex is defined elsewhere. This class relies on it
# responding to #start, #tokenize, #terms, #document and #files; the shapes
# noted below are inferred from how they are used here -- confirm against
# inverted_index.rb.
class Query
  attr_accessor :path, :query, :q_tokens, :q_vector, :qtf, :index, :matches, :file_weights

  # Upper bound on the number of ranked files printed (and returned) per query.
  MAX_RESULTS = 5

  # Builds and starts the index for +dir_path+.
  #
  # @param dir_path the value handed to InvertedIndex.new
  def initialize(dir_path)
    @path = dir_path
    @index = InvertedIndex.new(dir_path)
    @index.start
  end

  # Runs a query end to end: resets per-query state, then searches the index.
  #
  # @param q the raw query, passed to the index tokenizer
  # @return [Array, nil] the printed [file_id, weight] pairs, or nil when
  #   nothing matched (including an empty query)
  def find(q)
    init_query(q)
    search_index
  end

  # Resets all per-query state and derives the query tokens, term frequencies
  # and normalised query vector from +q+.
  def init_query(q)
    @q_vector = {}
    @matches = {}
    @file_weights = {}
    @query = q
    tokenize_query
    query_vector
  end

  # Collects matching index entries, scores the files and prints the ranking.
  # Prints "No results" and returns nil when no query term is in the index.
  def search_index
    find_matches
    if @matches.empty?
      puts "No results"
      return
    end
    cosine_similarity(matches)
    results
  end

  # Tokenizes the query with the index's own tokenizer and counts how often
  # each token occurs (the query term frequency, qtf).
  def tokenize_query
    @q_tokens = @index.tokenize(@query)
    @qtf = @q_tokens.group_by { |token| token }.map { |token, hits| [token, hits.length] }.to_h
  end

  # Fills @q_vector with each term frequency divided by the Euclidean length
  # of the frequency vector.
  def query_vector
    # Seed the fold with 0 so a query that yields no tokens gives a length of
    # 0.0 instead of nil; an unseeded reduce made Math.sqrt raise TypeError
    # before the "No results" path could run.
    sum_of_squares = @qtf.values.inject(0) { |acc, freq| acc + freq * freq }
    length = Math.sqrt(sum_of_squares)
    @qtf.each { |term, freq| @q_vector[term] = freq.to_f / length }
  end

  # Copies the index entry of every query term that the index knows about
  # into @matches; unknown terms are skipped.
  def find_matches
    @qtf.each_key do |term|
      entry = @index.terms[term]
      @matches[term] = entry if entry
    end
  end

  # Scores every file that contains at least one matched term by summing
  # (document term weight * query term weight) over all query terms, then
  # replaces @file_weights with [file_id, weight] pairs, highest weight first.
  #
  # The +matches+ argument is accepted for backward compatibility only; the
  # method reads @matches, exactly as it always has.
  #
  # NOTE(review): this is a true cosine similarity only if the :wt values
  # stored by the index are already length-normalised -- confirm.
  def cosine_similarity(matches)
    # :occurences (sic) is the key name read from the index entries here.
    matching_files = @matches.flat_map { |_term, entry| entry[:occurences].keys }.uniq
    matching_files.each do |file_id|
      doc_terms = @index.document[file_id]
      @file_weights[file_id] = @q_vector.inject(0.0) do |score, (term, query_wt)|
        # A term missing from this document contributes a weight of 0.
        doc_wt = (doc_terms[term] || {})[:wt] || 0
        score + doc_wt * query_wt
      end
    end
    @file_weights = @file_weights.sort_by { |_file_id, weight| weight }.reverse
  end

  # Prints the top MAX_RESULTS files as "<rank> : <file>" lines.
  #
  # @return [Array] the [file_id, weight] pairs that were printed
  def results
    @file_weights.first(MAX_RESULTS).each_with_index do |(file_id, _weight), rank|
      puts "#{rank + 1} : #{@index.files[file_id]}"
    end
  end
end