Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions TextSearch/Common/Types.ecl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Types for search system
// Types for search system

EXPORT Types := MODULE
EXPORT DocNo := UNSIGNED4;
Expand All @@ -16,8 +16,9 @@ EXPORT Types := MODULE
SymbolChar, // Ampersand, Section, et cetera
NoiseChar, // Noise, such as a comma or Tab
WhiteSpace, // blanks
SpecialStr); // special keyword string
EXPORT TermTypeAsString(TermType typ) := CASE(typ,
SpecialStr, // special keyword string
AcroStr); //Initialism and Acronyms
EXPORT TermTypeAsString(TermType typ) := CASE(typ,
1 => V'Text String',
2 => V'Number',
3 => V'Date',
Expand All @@ -27,9 +28,11 @@ EXPORT Types := MODULE
7 => V'Noise Character',
8 => V'White Space',
9 => V'Special Keyword',
10 => V'Initialism and Acronyms',
V'Unknown');
EXPORT KeywordTTypes := [TermType.TextStr, TermType.Number,
TermType.Date, TermType.SymbolChar];
TermType.Date, TermType.SymbolChar,
TermType.AcroStr];
EXPORT InvertTTypes := [TermType.TextStr, TermType.Number,
TermType.Date, TermType.Meta,
TermType.Tag, TermType.SymbolChar,
Expand Down
77 changes: 77 additions & 0 deletions TextSearch/Inverted/Initialism.ecl
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
// Extract initialisms / acronyms (e.g. "U.S.A.") from the AP XML corpus and
// re-emit each dotted term's components as individual postings.
//
// Pipeline: spray XML -> DocumentIngest -> EnumeratedDocs -> RawPostings,
// then split every term on the dotted pattern and flatten the pieces back
// into one posting per fragment.
IMPORT TextSearch.Inverted;
IMPORT TextSearch.Common;
IMPORT STD;
IMPORT TextSearch.Inverted.Layouts;   // fixed: was "TextSearch2" -- no such package
#option('outputLimit',100);

prefix := '~thor::jdh::';
inputName := prefix + 'corrected_lda_ap_txtt_xml';

// Raw shape of one /DOC element of the sprayed XML file.
Work1 := RECORD
  UNICODE doc_number{XPATH('/DOC/DOCNO')};              // document identifier
  UNICODE content{MAXLENGTH(32000000),XPATH('<>')};     // whole element text, tags included
  UNICODE text{MAXLENGTH(32000000),XPATH('/DOC/TEXT')}; // body text only
  UNSIGNED8 file_pos{VIRTUAL(fileposition)};            // physical position, used in the sequence key
  UNICODE init;
END;

// Map a raw document onto the ingest layout expected by the indexer.
Inverted.Layouts.DocumentIngest cvt(Work1 lr) := TRANSFORM
  SELF.identifier := TRIM(lr.doc_number, LEFT, RIGHT);
  SELF.seqKey := inputName + '-' + INTFORMAT(lr.file_pos, 12, 1);
  // Slug line = body text up to and including the first period.
  SELF.slugLine := lr.text[1..STD.Uni.Find(lr.text, '.', 1) + 1];
  SELF.content := lr.content;
  SELF.init := lr.content;
END;

stem := prefix + 'corrected_lda_ap_txtt_xml';
instance := 'initial2';
ds0 := DATASET(inputName, Work1, XML('/DOC', NOROOT));
inDocs := PROJECT(ds0, cvt(LEFT));
info := Common.FileName_Info_Instance(stem, instance);
// Dotted-initialism shape, e.g. "U.S." or "U.S.A."
expr2 := '[a-zA-Z][.][a-zA-Z][.]*[a-zA-Z]*[.]*[a-zA-Z]*';
enumDocs := Inverted.EnumeratedDocs(info, inDocs);
p1 := Inverted.ParsedText(enumDocs);
rawPostings := Inverted.RawPostings(enumDocs);
OUTPUT(rawPostings);

ValRec := RECORD
  UNICODE val;
END;
// A posting together with the fragments its term splits into.
DNrec := RECORD
  rawPostings;
  DATASET(ValRec) Values;
END;

// Split each term on the initialism pattern and on '.' separators,
// keeping every fragment as a child value.
DNrec filter(rawPostings L) := TRANSFORM
  SetStrVals := REGEXFINDSET(expr2, (STRING)L.term) +
                STD.Str.SplitWords((STRING)L.term, '.');
  ValuesDS := DATASET(SetStrVals, {STRING StrVal});
  SELF.Values := PROJECT(ValuesDS,
                         TRANSFORM(ValRec, SELF.val := (UNICODE)LEFT.StrVal));
  SELF := L;
END;
NestedDS := PROJECT(rawPostings, filter(LEFT));
NestedDS;

OutRec := RECORD
  rawPostings;
  UNICODE val;
END;

// Flatten the child values back out: one posting per fragment.
// Single-character fragments are tagged as single keywords / single-char terms.
res := NORMALIZE(NestedDS, COUNT(LEFT.Values),
         TRANSFORM(OutRec,
           SELF.val      := LEFT.Values[COUNTER].val,
           SELF.term     := LEFT.Values[COUNTER].val,
           SELF.len      := LENGTH(LEFT.Values[COUNTER].val),
           SELF.kwp      := LEFT.kwp + COUNTER,
           SELF.keywords := IF(LENGTH(LEFT.Values[COUNTER].val) = 1, 1, LEFT.keywords),
           SELF.lentext  := LENGTH(LEFT.Values[COUNTER].val),
           SELF.typterm  := IF(LENGTH(LEFT.Values[COUNTER].val) = 1, 1, LEFT.typterm),
           SELF := LEFT));

OUTPUT(res);

// Re-parse the flattened terms: an initialism immediately followed by a word.
PATTERN expr3 := PATTERN('[a-zA-Z][.][a-zA-Z]*[.][a-zA-Z]*[.]*[a-zA-Z]*');
PATTERN expr4 := PATTERN('[a-zA-Z][.][a-zA-Z]*');
PATTERN expr5 := PATTERN('[a-zA-Z]+');
TOKEN JustAWord := expr3 expr5;
RULE NounPhraseComp1 := JustAWord;
ps1 := {
  out1 := MATCHTEXT(NounPhraseComp1)
};
p14 := PARSE(res, val, NounPhraseComp1, ps1, BEST, MANY, NOCASE);
OUTPUT(p14, NAMED('Result_4'));
1 change: 1 addition & 0 deletions TextSearch/Inverted/Layouts.ecl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ EXPORT Layouts := MODULE
Types.SequenceKey seqKey;
Types.SlugLine slugLine;
UNICODE content;
UNICODE init;
END;
EXPORT DocumentNo := RECORD
Types.DocNo id;
Expand Down
31 changes: 31 additions & 0 deletions TextSearch/Inverted/Moby.ecl
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/*
  Moby Thesaurus is an open-source set of files used in this project to
  return sets of synonyms.  Download it from:
    http://www.gutenberg.org/catalog/world/results?title=moby+list
  and spray the file in ECL Watch as a delimited CSV.

  Each line of the file is "headword,synonym1,synonym2,...", so the whole
  line must be read into a single field and split on ',' afterwards.
*/
#option('outputLimit',100);

IMPORT STD;

CSVRecord := RECORD
  STRING word;   // one full thesaurus line: headword plus all its synonyms
END;

// SEPARATOR('') keeps the entire line in 'word'.  The previous
// SEPARATOR([',']) cut the field at the first comma, so the synonym
// list after the headword was silently discarded.
file3 := DATASET('~thor::jdh::moby',
                 CSVRecord,
                 CSV(HEADING(1),
                     SEPARATOR(''),
                     TERMINATOR(['\n'])));
file3;

// A headword together with its synonym set.
cont := RECORD
  UNICODE term;
  SET OF UNICODE synonyms;
END;

cont filter(CSVRecord doc) := TRANSFORM
  words := STD.Str.SplitWords(doc.word, ',');
  SELF.term := words[1];         // first entry is the headword
  SELF.synonyms := words[2..];   // remaining entries are its synonyms
END;

s := PROJECT(file3, filter(LEFT));
OUTPUT(s);
45 changes: 40 additions & 5 deletions TextSearch/Inverted/ParsedText.ecl
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
// Parse contents of the document
// Parse contents of the document
IMPORT TextSearch;
IMPORT TextSearch.Common;
IMPORT TextSearch.Inverted.Layouts;
IMPORT STD;
Document := Layouts.Document;
RawPosting := Layouts.RawPosting;
Types := Common.Types;
Expand Down Expand Up @@ -37,11 +38,14 @@ EXPORT DATASET(RawPosting) ParsedText(DATASET(Document) docsInput) := FUNCTION
PATTERN EmptyEnd := REPEAT(AttrListItem) OPT(Spaces) U'/>';
PATTERN XMLElement := U'<' XMLName BEFORE ContainerEnd;
PATTERN XMLEmpty := U'<' XMLName BEFORE EmptyEnd;

RULE myRule := XMLDecl OR XMLComment OR XMLElement OR XMLEmpty OR

PATTERN Initialism_Pattern :=PATTERN(U'[a-zA-Z]+[.][a-zA-Z]+[.][a-zA-Z]*[.]*[a-zA-Z]*');

RULE myRule := Initialism_Pattern OR WordAllLower OR WordAllUpper OR WordTitleCase OR WordMixedCase OR
XMLDecl OR XMLComment OR XMLElement OR XMLEmpty OR
AttributeExpr OR EndElement OR TagEndSeq OR
WordAlphaNum OR WhiteSpace OR PoundCode OR
SymbolChar OR Noise OR AnyChar OR AnyPair;
SymbolChar OR Noise OR AnyChar OR AnyPair OR WordNoLetters;

RawPosting parseString(Document doc) := TRANSFORM
SELF.id := doc.id;;
Expand All @@ -58,12 +62,24 @@ EXPORT DATASET(RawPosting) ParsedText(DATASET(Document) docsInput) := FUNCTION
MATCHED(WordAlphaNum) => MATCHLENGTH(MyRule),
MATCHED(AnyChar) => MATCHLENGTH(MyRule),
MATCHED(AnyPair) => MATCHLENGTH(MyRule),
MATCHED(Initialism_Pattern) => MATCHLENGTH(MyRule),
MATCHED(WordAllUpper) => MATCHLENGTH(MyRule),
MATCHED(WordAllLower) => MATCHLENGTH(MyRule),
MATCHED(WordMixedCase) => MATCHLENGTH(MyRule),
MATCHED(WordNoLetters) => MATCHLENGTH(MyRule),
MATCHED(WordTitleCase) => MATCHLENGTH(MyRule),
0);
SELF.keywords := MAP(
MATCHED(SymbolChar) => 1,
MATCHED(WordAlphaNum) => 1,
MATCHED(AnyChar) => 1,
MATCHED(AnyPair) => 1,
MATCHED(Initialism_Pattern) => MATCHLENGTH(Initialism_Pattern)- STD.Str.FindCount((STRING)MATCHTEXT(Initialism_Pattern), '.'),
MATCHED(WordAllUpper) =>1,
MATCHED(WordAllLower) =>1,
MATCHED(WordMixedCase) =>1,
MATCHED(WordTitleCase) =>1,
MATCHED(WordNoLetters) =>1,
0);
SELF.typTerm := MAP(
MATCHED(WhiteSpace) => Types.TermType.WhiteSpace,
Expand All @@ -80,6 +96,12 @@ EXPORT DATASET(RawPosting) ParsedText(DATASET(Document) docsInput) := FUNCTION
MATCHED(EndElement) => Types.TermType.Tag,
MATCHED(TagEndSeq) => Types.TermType.Tag,
MATCHED(PoundCode) => Types.TermType.TextStr,
MATCHED(Initialism_Pattern) => Types.TermType.AcroStr,
MATCHED(WordAllUpper) => Types.TermType.TextStr,
MATCHED(WordAllLower) => Types.TermType.TextStr,
MATCHED(WordMixedCase) => Types.TermType.TextStr,
MATCHED(WordTitleCase) => Types.TermType.TextStr,
MATCHED(WordNoLetters) => Types.TermType.SymbolChar,
Types.TermType.Unknown);
SELF.typData := MAP(
MATCHED(WhiteSpace) => Types.DataType.RawData,
Expand All @@ -97,6 +119,12 @@ EXPORT DATASET(RawPosting) ParsedText(DATASET(Document) docsInput) := FUNCTION
MATCHED(EndElement) => Types.DataType.EndElement,
MATCHED(TagEndSeq) => Types.DataType.TagEndSeq,
MATCHED(PoundCode) => Types.DataType.RawData,
MATCHED(Initialism_Pattern) => Types.DataType.RawData,
MATCHED(WordAllUpper) => Types.DataType.RawData,
MATCHED(WordAllLower) => Types.DataType.RawData,
MATCHED(WordMixedCase) => Types.DataType.RawData,
MATCHED(WordTitleCase) => Types.DataType.RawData,
MATCHED(WordNoLetters) => Types.DataType.RawData,
Types.DataType.Unknown);
SELF.tagValue := MAP(
NOT MATCHED(AttributeExpr) => U'',
Expand All @@ -113,7 +141,14 @@ EXPORT DATASET(RawPosting) ParsedText(DATASET(Document) docsInput) := FUNCTION
SELF.preorder := 0;
SELF.parentOrd := 0;
SELF.parentName:= U'';
SELF.lp := Types.LetterPattern.Unknown;
SELF.lp := MAP(
MATCHED(WordAllUpper) => Types.LetterPattern.UpperCase,
MATCHED(WordAllLower) => Types.LetterPattern.LowerCase,
MATCHED(WordMixedCase) => Types.LetterPattern.MixedCase,
MATCHED(WordNoLetters) => Types.LetterPattern.NoLetters,
MATCHED(WordTitleCase) => Types.LetterPattern.TitleCase,
Types.LetterPattern.Unknown);

SELF.term := MATCHUNICODE(MyRule);
END;
p0 := PARSE(docsInput, content, myRule, parseString(LEFT), MAX, MANY, NOT MATCHED);
Expand Down
5 changes: 3 additions & 2 deletions TextSearch/Inverted/RawPostings.ecl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
//Convert raw content into posting records
//Convert raw content into posting records
IMPORT TextSearch.Common;
IMPORT TextSearch.Common.Types;
IMPORT TextSearch.Inverted;
Expand Down Expand Up @@ -114,10 +114,11 @@ EXPORT GROUPED DATASET(Posting) RawPostings(DATASET(Document) docIn) := FUNCTION
SELF.parentName := topParentName;
SELF.lenText := IF(closeElement, st.lenText, posting.lenText);
SELF.keywords := IF(closeElement, st.keywords, posting.keywords);
SELF.typData :=IF(SELF.depth>0 and posting.typData =Types.DataType.RawData ,Types.DataType.PCDATA,Types.DataType.RawData);
SELF := posting;
END;
initalV := ROW(initState());
p2 := PROCESS(p1, initalV, assign(LEFT,RIGHT), next(LEFT,RIGHT), LOCAL);
p3 := GROUP(p2, id) : ONWARNING(1037, IGNORE);
RETURN p3;
END;
END;
131 changes: 131 additions & 0 deletions TextSearch/Inverted/States.ecl
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
// Ingest the AP XML corpus and build raw postings, then match posting
// terms against US state codes/names (see the dictionaries further down).
IMPORT TextSearch.Inverted;
IMPORT TextSearch.Common;
IMPORT STD;
IMPORT TextSearch.Inverted.Layouts;
// NOTE(review): python is imported but never used in this file -- confirm
// whether an embedded-Python step was planned, otherwise remove.
Import python;

#option('outputLimit',100);
prefix := '~thor::jdh::';
inputName := prefix + 'corrected_lda_ap_txtt_xml';

// Raw shape of one /DOC element of the sprayed XML file.
Work1 := RECORD
UNICODE doc_number{XPATH('/DOC/DOCNO')};              // document identifier
UNICODE content{MAXLENGTH(32000000),XPATH('<>')};     // whole element text, tags included
UNICODE text{MAXLENGTH(32000000),XPATH('/DOC/TEXT')}; // body text only
UNSIGNED8 file_pos{VIRTUAL(fileposition)};            // physical position, used in the sequence key
UNICODE init;
END;

// Map a raw document onto the ingest layout expected by the indexer.
Inverted.Layouts.DocumentIngest cvt(Work1 lr) := TRANSFORM
SELF.identifier := TRIM(lr.doc_number, LEFT,RIGHT);
SELF.seqKey := inputName + '-' + INTFORMAT(lr.file_pos,12,1);
// Slug line = body text up to and including the first period.
SELF.slugLine := lr.text[1..STD.Uni.Find(lr.text,'.',1)+1];
SELF.content := lr.content;
SELF.init:=lr.content;
END;


stem := prefix + 'corrected_lda_ap_txtt_xml';
instance := 'initial2';
ds0 := DATASET(inputName, Work1, XML('/DOC', NOROOT));
inDocs := PROJECT(ds0, cvt(LEFT));
info := Common.FileName_Info_Instance(stem, instance);
enumDocs := Inverted.EnumeratedDocs(info, inDocs);
p1 := Inverted.ParsedText(enumDocs);
rawPostings := Inverted.RawPostings(enumDocs);

// Persist the full posting stream before any state filtering.
OUTPUT(rawPostings,,'~ONLINE::Farah::OUT::Solution1',OVERWRITE);

// Lookup table of US state / territory postal codes and full names.
rec := RECORD
UNICODE code;   // two-letter postal abbreviation
UNICODE state;  // full state or territory name
END;
Ds := DATASET([{'AK', 'Alaska'},
{'AL', 'Alabama'},
{'AR', 'Arkansas'},
{'AS', 'American Samoa'},
{'AZ', 'Arizona'},
{'CA', 'California'},
{'CO', 'Colorado'},
{'CT', 'Connecticut'},
{'DC', 'District of Columbia'},
{'DE', 'Delaware'},
{'FL', 'Florida'},
{'GA', 'Georgia'},
{'GU', 'Guam'},
{'HI', 'Hawaii'},
{'IA', 'Iowa'},
{'ID', 'Idaho'},
{'IL', 'Illinois'},
{'IN', 'Indiana'},
{'KS', 'Kansas'},
{'KY', 'Kentucky'},
{'LA', 'Louisiana'},
{'MA', 'Massachusetts'},
{'MD', 'Maryland'},
{'ME', 'Maine'},
{'MI', 'Michigan'},
{'MN', 'Minnesota'},
{'MO', 'Missouri'},
{'MP', 'Northern Mariana Islands'},
{'MS', 'Mississippi'},
{'MT', 'Montana'},
{'NA', 'National'},
{'NC', 'North Carolina'},
{'ND', 'North Dakota'},
{'NE', 'Nebraska'},
{'NH', 'New Hampshire'},
{'NJ', 'New Jersey'},
{'NM', 'New Mexico'},
{'NV', 'Nevada'},
{'NY', 'New York'},
{'OH', 'Ohio'},
{'OK', 'Oklahoma'},
{'OR', 'Oregon'},
{'PA', 'Pennsylvania'},
{'PR', 'Puerto Rico'},
{'RI', 'Rhode Island'},
{'SC', 'South Carolina'},
{'SD', 'South Dakota'},
{'TN', 'Tennessee'},
{'TX', 'Texas'},
{'UT', 'Utah'},
{'VA', 'Virginia'},
{'VI', 'Virgin Islands'},
{'VT', 'Vermont'},
{'WA', 'Washington'},
{'WI', 'Wisconsin'},
{'WV', 'West Virginia'},
{'WY', 'Wyoming'}],rec);

// Dictionary keyed on the two-letter code, and one keyed on the full name,
// so a term can be tested against either form with IN.
DsDCT := DICTIONARY(DS,{code => DS});
DsDCT2 := DICTIONARY(DS,{state => DS});
// NOTE(review): ECL dataset indexing is 1-based; rawPostings[0] yields an
// all-blank row, so this always tests the empty term -- confirm whether
// rawPostings[1] was intended.
OUTPUT(rawPostings[0].term IN DsDCT2);

// Slim record: just the posting term.
cont:= RECORD
rawPostings.term;
END;;
// Keep the term only when it matches a state code or a state name;
// otherwise blank it out.
cont filter(Inverted.Layouts.RawPosting doc) := TRANSFORM

SELF.term:=if(doc.term IN DsDCT or doc.term IN DsDCT2,doc.term,'');;

SELF := doc;
END;
s:= PROJECT(rawPostings, filter(LEFT));
output(s);


ValRec := RECORD
unicode val;
END;
// Full posting layout, unchanged.
DNrec := RECORD
RawPostings ;
END;

// NOTE(review): t is just L.term, so both branches of this IF return the
// same value and the state test has no effect -- presumably the non-match
// branch was meant to blank the term (as in filter above); confirm intent.
DNrec filter3(rawPostings L) := TRANSFORM
unicode t:=L.term;
SELF.term:=if(L.term IN DsDCT or L.term IN DsDCT2,t,L.term);;
SELF:=l;
END;
NestedDS := PROJECT(rawPostings,filter3(LEFT));
output(NestedDS);
Loading