diff --git a/TextSearch/Common/Types.ecl b/TextSearch/Common/Types.ecl
index 9128267..d9bbb3f 100644
--- a/TextSearch/Common/Types.ecl
+++ b/TextSearch/Common/Types.ecl
@@ -1,4 +1,4 @@
-// Types for search system
+// Types for search system
 EXPORT Types := MODULE
   EXPORT DocNo := UNSIGNED4;
@@ -16,8 +16,9 @@ EXPORT Types := MODULE
                     SymbolChar, // Ampersand, Section, et cetera
                     NoiseChar, // Noise, such as a comma or Tab
                     WhiteSpace, // blanks
-                    SpecialStr); // special keyword string
-  EXPORT TermTypeAsString(TermType typ) := CASE(typ,
+                    SpecialStr, // special keyword string
+                    AcroStr); //Initialism and Acronyms
+  EXPORT TermTypeAsString(TermType typ) := CASE(typ,
     1 => V'Text String',
     2 => V'Number',
     3 => V'Date',
@@ -27,9 +28,11 @@ EXPORT Types := MODULE
     7 => V'Noise Character',
     8 => V'White Space',
     9 => V'Special Keyword',
+    10 => V'Initialism and Acronyms',
     V'Unknown');
   EXPORT KeywordTTypes := [TermType.TextStr, TermType.Number,
-                           TermType.Date, TermType.SymbolChar];
+                           TermType.Date, TermType.SymbolChar,
+                           TermType.AcroStr];
   EXPORT InvertTTypes := [TermType.TextStr, TermType.Number,
                           TermType.Date, TermType.Meta,
                           TermType.Tag, TermType.SymbolChar,
diff --git a/TextSearch/Inverted/Initialism.ecl b/TextSearch/Inverted/Initialism.ecl
new file mode 100644
index 0000000..c4ecdf0
--- /dev/null
+++ b/TextSearch/Inverted/Initialism.ecl
@@ -0,0 +1,77 @@
+IMPORT TextSearch.Inverted;
+IMPORT TextSearch.Common;
+IMPORT STD;
+IMPORT TextSearch.Inverted.Layouts;  // was TextSearch2.Inverted.Layouts; the other imports here all target TextSearch
+#option('outputLimit',100);
+
+prefix := '~thor::jdh::';
+inputName := prefix + 'corrected_lda_ap_txtt_xml';  // logical name of the XML input file
+
+Work1 := RECORD  // one row per /DOC element read from inputName
+  UNICODE doc_number{XPATH('/DOC/DOCNO')};
+  UNICODE content{MAXLENGTH(32000000),XPATH('<>')};
+  UNICODE text{MAXLENGTH(32000000),XPATH('/DOC/TEXT')};
+  UNSIGNED8 file_pos{VIRTUAL(fileposition)};
+  UNICODE init;
+END;
+
+Inverted.Layouts.DocumentIngest cvt(Work1 lr) := TRANSFORM  // Work1 row -> DocumentIngest row
+  SELF.identifier := TRIM(lr.doc_number, LEFT,RIGHT);
+  SELF.seqKey := inputName + '-' + INTFORMAT(lr.file_pos,12,1);  // file name plus zero-padded file position
+  SELF.slugLine := lr.text[1..STD.Uni.Find(lr.text,'.',1)+1];  // text up to one character past the first '.'
+  SELF.content := lr.content;
+  SELF.init:=lr.content;  // init carries a second copy of the content
+END;
+
+stem := prefix + 'corrected_lda_ap_txtt_xml';
+instance := 'initial2';
+ds0 := DATASET(inputName, Work1, XML('/DOC', NOROOT));
+inDocs := PROJECT(ds0, cvt(LEFT));
+info := Common.FileName_Info_Instance(stem, instance);
+expr:=U'[a-zA-Z][.][a-zA-Z][.]*[a-zA-Z]*[.]*[a-zA-Z]*';  // not referenced below
+expr2:='[a-zA-Z][.][a-zA-Z][.]*[a-zA-Z]*[.]*[a-zA-Z]*';  // STRING form used by REGEXFINDSET in filter
+enumDocs:= Inverted.EnumeratedDocs(info, inDocs);
+p1 := Inverted.ParsedText(enumDocs);  // not referenced below
+rawPostings := Inverted.RawPostings(enumDocs);
+OUTPUT(rawPostings);
+ValRec := RECORD
+  unicode val;
+END;
+DNrec := RECORD  // a posting plus the candidate strings derived from its term
+  RawPostings ;
+  DATASET(ValRec) Values;
+END;
+
+DNrec filter(rawPostings L) := TRANSFORM  // Values = regex hits on the term followed by its '.'-separated pieces
+  SetStrVals := REGEXFINDSET(expr2,(STRING)L.term)+Std.Str.SplitWords((STRING)L.term,'.');
+  ValuesDS := DATASET(SetStrVals,{STRING StrVal});
+  SELF.Values := PROJECT(ValuesDS,
+                  TRANSFORM(ValRec,
+                  SELF.val := (unicode)Left.StrVal));
+  SELF:=l;
+END;
+NestedDS := PROJECT(rawPostings,filter(LEFT));
+NestedDS;
+
+OutRec := RECORD
+  RawPostings;
+  unicode val;
+END;
+
+res:=NORMALIZE(NestedDS,COUNT(LEFT.Values),
+              TRANSFORM(OutRec,
+              SELF.val := LEFT.Values[COUNTER].val,Self.term:=LEFT.Values[COUNTER].val,SELF.len:=length(LEFT.Values[COUNTER].val),SELF.kwp:=LEFT.kwp+COUNTER,SELF.keywords:=if(length(LEFT.Values[COUNTER].val)=1,1,LEFT.keywords)
+              ,SELF.lentext:=length(LEFT.Values[COUNTER].val),SELF.typterm:=if(length(LEFT.Values[COUNTER].val)=1,1,LEFT.typterm)/*,SELF.lp:=if(LEFT.lp=0,,LEFT.lp)*/;
+              SELF := LEFT,
+              ));
+
+output(res);  // one row per entry of Values, with term/len/kwp rewritten for that entry
+PATTERN expr3 :=PATTERN('[a-zA-Z][.][a-zA-Z]*[.][a-zA-Z]*[.]*[a-zA-Z]*');
+PATTERN expr4 :=PATTERN('[a-zA-Z][.][a-zA-Z]*');  // not referenced below
+PATTERN expr5 :=PATTERN('[a-zA-Z]+');
+TOKEN JustAWord := expr3 expr5;
+RULE NounPhraseComp1 := JustAWord ;
+ps1 := {
+out1 := MATCHTEXT(NounPhraseComp1) };
+p14 := PARSE(res, val, NounPhraseComp1, ps1, BEST,MANY,NOCASE);
+output(p14,NAMED('Result_4'));
\ No newline
at end of file
diff --git a/TextSearch/Inverted/Layouts.ecl b/TextSearch/Inverted/Layouts.ecl
index 67fe79b..945fe13 100644
--- a/TextSearch/Inverted/Layouts.ecl
+++ b/TextSearch/Inverted/Layouts.ecl
@@ -7,6 +7,7 @@ EXPORT Layouts := MODULE
     Types.SequenceKey seqKey;
     Types.SlugLine slugLine;
     UNICODE content;
+    UNICODE init;
   END;
   EXPORT DocumentNo := RECORD
     Types.DocNo id;
diff --git a/TextSearch/Inverted/Moby.ecl b/TextSearch/Inverted/Moby.ecl
new file mode 100644
index 0000000..3b9552c
--- /dev/null
+++ b/TextSearch/Inverted/Moby.ecl
@@ -0,0 +1,31 @@
+/*
+Moby Thesaurus is an open-source set of files used in this project to return a set of synonyms.
+You can download it from this link: http://www.gutenberg.org/catalog/world/results?title=moby+list
+and spray the dataset in ECL Watch as a delimited CSV file using the default delimiter.
+*/
+#option('outputLimit',100);
+
+import std;
+CSVRecord := RECORD
+  string word;
+
+END;
+ file3 := DATASET('~thor::jdh::moby',
+                  CSVrecord,
+                  CSV(HEADING(1),
+                  SEPARATOR([',']),  // NOTE(review): with ',' as separator the single field may only receive the first column, leaving the splits below nothing to split - confirm
+                  TERMINATOR(['\n'])));
+file3;
+
+cont:= RECORD
+  unicode term;
+  set of unicode synonyms;
+END;
+cont filter(file3 doc) := TRANSFORM  // first comma-separated word becomes the term, the rest its synonyms
+
+SELF.term:=STD.STr.SplitWords(doc.word,',')[1];
+SELF.synonyms:=STD.STr.SplitWords(doc.word,',')[2..];
+SELF := doc;
+END;
+s:= PROJECT(file3, filter(LEFT));
+output(s);
\ No newline at end of file
diff --git a/TextSearch/Inverted/ParsedText.ecl b/TextSearch/Inverted/ParsedText.ecl
index 59e7d40..472b15c 100644
--- a/TextSearch/Inverted/ParsedText.ecl
+++ b/TextSearch/Inverted/ParsedText.ecl
@@ -1,7 +1,8 @@
-// Parse contents of the document
+// Parse contents of the document
 IMPORT TextSearch;
 IMPORT TextSearch.Common;
 IMPORT TextSearch.Inverted.Layouts;
+IMPORT STD;
 Document := Layouts.Document;
 RawPosting := Layouts.RawPosting;
 Types := Common.Types;
@@ -37,11 +38,14 @@ EXPORT DATASET(RawPosting) ParsedText(DATASET(Document) docsInput) := FUNCTION
   PATTERN EmptyEnd := REPEAT(AttrListItem) OPT(Spaces) U'/>';
   PATTERN XMLElement := U'<' XMLName BEFORE ContainerEnd;
   PATTERN XMLEmpty := U'<' XMLName BEFORE EmptyEnd;
-
-  RULE myRule := XMLDecl OR XMLComment OR XMLElement OR XMLEmpty OR
+
+  PATTERN Initialism_Pattern :=PATTERN(U'[a-zA-Z]+[.][a-zA-Z]+[.][a-zA-Z]*[.]*[a-zA-Z]*');  // letters '.' letters '.', then optional letters/dots/letters
+
+  RULE myRule := Initialism_Pattern OR WordAllLower OR WordAllUpper OR WordTitleCase OR WordMixedCase OR
+                 XMLDecl OR XMLComment OR XMLElement OR XMLEmpty OR
                  AttributeExpr OR EndElement OR TagEndSeq OR
                  WordAlphaNum OR WhiteSpace OR PoundCode OR
-                 SymbolChar OR Noise OR AnyChar OR AnyPair;
+                 SymbolChar OR Noise OR AnyChar OR AnyPair OR WordNoLetters;
 
   RawPosting parseString(Document doc) := TRANSFORM
     SELF.id := doc.id;;
@@ -58,12 +62,24 @@ EXPORT DATASET(RawPosting) ParsedText(DATASET(Document) docsInput) := FUNCTION
       MATCHED(WordAlphaNum) => MATCHLENGTH(MyRule),
       MATCHED(AnyChar) => MATCHLENGTH(MyRule),
       MATCHED(AnyPair) => MATCHLENGTH(MyRule),
+      MATCHED(Initialism_Pattern) => MATCHLENGTH(MyRule),
+      MATCHED(WordAllUpper) => MATCHLENGTH(MyRule),
+      MATCHED(WordAllLower) => MATCHLENGTH(MyRule),
+      MATCHED(WordMixedCase) => MATCHLENGTH(MyRule),
+      MATCHED(WordNoLetters) => MATCHLENGTH(MyRule),
+      MATCHED(WordTitleCase) => MATCHLENGTH(MyRule),
       0);
     SELF.keywords := MAP(
       MATCHED(SymbolChar) => 1,
       MATCHED(WordAlphaNum) => 1,
       MATCHED(AnyChar) => 1,
       MATCHED(AnyPair) => 1,
+      MATCHED(Initialism_Pattern) => MATCHLENGTH(Initialism_Pattern)- STD.Str.FindCount((STRING)MATCHTEXT(Initialism_Pattern), '.'),  // matched length minus the number of dots
+      MATCHED(WordAllUpper) =>1,
+      MATCHED(WordAllLower) =>1,
+      MATCHED(WordMixedCase) =>1,
+      MATCHED(WordTitleCase) =>1,
+      MATCHED(WordNoLetters) =>1,
       0);
     SELF.typTerm := MAP(
       MATCHED(WhiteSpace) => Types.TermType.WhiteSpace,
@@ -80,6 +96,12 @@ EXPORT DATASET(RawPosting) ParsedText(DATASET(Document) docsInput) := FUNCTION
       MATCHED(EndElement) => Types.TermType.Tag,
       MATCHED(TagEndSeq) => Types.TermType.Tag,
       MATCHED(PoundCode) => Types.TermType.TextStr,
+      MATCHED(Initialism_Pattern) => Types.TermType.AcroStr,
+      MATCHED(WordAllUpper) => Types.TermType.TextStr,
+      MATCHED(WordAllLower) => Types.TermType.TextStr,
+      MATCHED(WordMixedCase) => Types.TermType.TextStr,
+      MATCHED(WordTitleCase) => Types.TermType.TextStr,
+      MATCHED(WordNoLetters) => Types.TermType.SymbolChar,
       Types.TermType.Unknown);
     SELF.typData := MAP(
       MATCHED(WhiteSpace) => Types.DataType.RawData,
@@ -97,6 +119,12 @@ EXPORT DATASET(RawPosting) ParsedText(DATASET(Document) docsInput) := FUNCTION
       MATCHED(EndElement) => Types.DataType.EndElement,
       MATCHED(TagEndSeq) => Types.DataType.TagEndSeq,
       MATCHED(PoundCode) => Types.DataType.RawData,
+      MATCHED(Initialism_Pattern) => Types.DataType.RawData,
+      MATCHED(WordAllUpper) => Types.DataType.RawData,
+      MATCHED(WordAllLower) => Types.DataType.RawData,
+      MATCHED(WordMixedCase) => Types.DataType.RawData,
+      MATCHED(WordTitleCase) => Types.DataType.RawData,
+      MATCHED(WordNoLetters) => Types.DataType.RawData,
       Types.DataType.Unknown);
     SELF.tagValue := MAP(
       NOT MATCHED(AttributeExpr) => U'',
@@ -113,7 +141,14 @@ EXPORT DATASET(RawPosting) ParsedText(DATASET(Document) docsInput) := FUNCTION
     SELF.preorder := 0;
     SELF.parentOrd := 0;
     SELF.parentName:= U'';
-    SELF.lp := Types.LetterPattern.Unknown;
+    SELF.lp := MAP(  // letter pattern now follows whichever Word* pattern matched
+      MATCHED(WordAllUpper) => Types.LetterPattern.UpperCase,
+      MATCHED(WordAllLower) => Types.LetterPattern.LowerCase,
+      MATCHED(WordMixedCase) => Types.LetterPattern.MixedCase,
+      MATCHED(WordNoLetters) => Types.LetterPattern.NoLetters,
+      MATCHED(WordTitleCase) => Types.LetterPattern.TitleCase,
+      Types.LetterPattern.Unknown);
+
     SELF.term := MATCHUNICODE(MyRule);
   END;
   p0 := PARSE(docsInput, content, myRule, parseString(LEFT), MAX, MANY, NOT MATCHED);
diff --git a/TextSearch/Inverted/RawPostings.ecl b/TextSearch/Inverted/RawPostings.ecl
index 0ab9c1c..57c7f0a 100644
--- a/TextSearch/Inverted/RawPostings.ecl
+++ b/TextSearch/Inverted/RawPostings.ecl
@@ -1,4 +1,4 @@
-//Convert raw content into posting records
+//Convert raw content into posting records
 IMPORT TextSearch.Common;
 IMPORT TextSearch.Common.Types;
 IMPORT TextSearch.Inverted;
@@ -114,10 +114,11 @@ EXPORT GROUPED DATASET(Posting) RawPostings(DATASET(Document) docIn) := FUNCTION
     SELF.parentName := topParentName;
     SELF.lenText := IF(closeElement, st.lenText, posting.lenText);
     SELF.keywords := IF(closeElement, st.keywords, posting.keywords);
+    SELF.typData := IF(SELF.depth>0 AND posting.typData=Types.DataType.RawData, Types.DataType.PCDATA, posting.typData);  // only raw data at depth>0 is relabelled PCDATA; every other posting keeps its own type
     SELF := posting;
   END;
   initalV := ROW(initState());
   p2 := PROCESS(p1, initalV, assign(LEFT,RIGHT), next(LEFT,RIGHT), LOCAL);
   p3 := GROUP(p2, id) : ONWARNING(1037, IGNORE);
   RETURN p3;
-END;
+END;
\ No newline at end of file
diff --git a/TextSearch/Inverted/States.ecl b/TextSearch/Inverted/States.ecl
new file mode 100644
index 0000000..cedab3b
--- /dev/null
+++ b/TextSearch/Inverted/States.ecl
@@ -0,0 +1,131 @@
+IMPORT TextSearch.Inverted;
+IMPORT TextSearch.Common;
+IMPORT STD;
+IMPORT TextSearch.Inverted.Layouts;
+Import python;
+
+#option('outputLimit',100);
+prefix := '~thor::jdh::';
+inputName := prefix + 'corrected_lda_ap_txtt_xml';  // logical name of the XML input file
+
+Work1 := RECORD  // one row per /DOC element read from inputName
+  UNICODE doc_number{XPATH('/DOC/DOCNO')};
+  UNICODE content{MAXLENGTH(32000000),XPATH('<>')};
+  UNICODE text{MAXLENGTH(32000000),XPATH('/DOC/TEXT')};
+  UNSIGNED8 file_pos{VIRTUAL(fileposition)};
+  UNICODE init;
+END;
+
+Inverted.Layouts.DocumentIngest cvt(Work1 lr) := TRANSFORM  // Work1 row -> DocumentIngest row
+  SELF.identifier := TRIM(lr.doc_number, LEFT,RIGHT);
+  SELF.seqKey := inputName + '-' + INTFORMAT(lr.file_pos,12,1);  // file name plus zero-padded file position
+  SELF.slugLine := lr.text[1..STD.Uni.Find(lr.text,'.',1)+1];  // text up to one character past the first '.'
+  SELF.content := lr.content;
+  SELF.init:=lr.content;
+END;
+
+
+stem := prefix + 'corrected_lda_ap_txtt_xml';
+instance := 'initial2';
+ds0 := DATASET(inputName, Work1, XML('/DOC', NOROOT));
+inDocs := PROJECT(ds0, cvt(LEFT));
+info := Common.FileName_Info_Instance(stem, instance);
+enumDocs := Inverted.EnumeratedDocs(info, inDocs);
+p1 := Inverted.ParsedText(enumDocs);
+rawPostings := Inverted.RawPostings(enumDocs);
+
+OUTPUT(rawPostings,,'~ONLINE::Farah::OUT::Solution1',OVERWRITE);
+
+rec := RECORD  // two-letter code and the name it stands for
+  UNICODE code;
+  UNICODE state;
+END;
+Ds := DATASET([{'AK', 'Alaska'},
+  {'AL', 'Alabama'},
+  {'AR', 'Arkansas'},
+  {'AS', 'American Samoa'},
+  {'AZ', 'Arizona'},
+  {'CA', 'California'},
+  {'CO', 'Colorado'},
+  {'CT', 'Connecticut'},
+  {'DC', 'District of Columbia'},
+  {'DE', 'Delaware'},
+  {'FL', 'Florida'},
+  {'GA', 'Georgia'},
+  {'GU', 'Guam'},
+  {'HI', 'Hawaii'},
+  {'IA', 'Iowa'},
+  {'ID', 'Idaho'},
+  {'IL', 'Illinois'},
+  {'IN', 'Indiana'},
+  {'KS', 'Kansas'},
+  {'KY', 'Kentucky'},
+  {'LA', 'Louisiana'},
+  {'MA', 'Massachusetts'},
+  {'MD', 'Maryland'},
+  {'ME', 'Maine'},
+  {'MI', 'Michigan'},
+  {'MN', 'Minnesota'},
+  {'MO', 'Missouri'},
+  {'MP', 'Northern Mariana Islands'},
+  {'MS', 'Mississippi'},
+  {'MT', 'Montana'},
+  {'NA', 'National'},
+  {'NC', 'North Carolina'},
+  {'ND', 'North Dakota'},
+  {'NE', 'Nebraska'},
+  {'NH', 'New Hampshire'},
+  {'NJ', 'New Jersey'},
+  {'NM', 'New Mexico'},
+  {'NV', 'Nevada'},
+  {'NY', 'New York'},
+  {'OH', 'Ohio'},
+  {'OK', 'Oklahoma'},
+  {'OR', 'Oregon'},
+  {'PA', 'Pennsylvania'},
+  {'PR', 'Puerto Rico'},
+  {'RI', 'Rhode Island'},
+  {'SC', 'South Carolina'},
+  {'SD', 'South Dakota'},
+  {'TN', 'Tennessee'},
+  {'TX', 'Texas'},
+  {'UT', 'Utah'},
+  {'VA', 'Virginia'},
+  {'VI', 'Virgin Islands'},
+  {'VT', 'Vermont'},
+  {'WA', 'Washington'},
+  {'WI', 'Wisconsin'},
+  {'WV', 'West Virginia'},
+  {'WY', 'Wyoming'}],rec);
+
+DsDCT := DICTIONARY(DS,{code => DS});  // lookup keyed by code
+DsDCT2 := DICTIONARY(DS,{state => DS});  // lookup keyed by state name
+OUTPUT(rawPostings[1].term IN DsDCT2);  // was [0]: ECL row indexing is 1-based, so [0] never addresses the first posting
+
+cont:= RECORD
+  rawPostings.term;
+END;;
+cont filter(Inverted.Layouts.RawPosting doc) := TRANSFORM  // keep the term only when it is a known code or state name, else ''
+
+SELF.term:=if(doc.term IN DsDCT or doc.term IN DsDCT2,doc.term,'');;
+
+SELF := doc;
+END;
+s:= PROJECT(rawPostings, filter(LEFT));
+output(s);
+
+
+ValRec := RECORD
+  unicode val;
+END;
+DNrec := RECORD
+  RawPostings ;
+END;
+
+DNrec filter3(rawPostings L) := TRANSFORM
+  unicode t:=L.term;
+  SELF.term:=if(L.term IN DsDCT or L.term IN DsDCT2,t,L.term);;  // NOTE(review): both branches yield L.term, so this is a pass-through - confirm intent
+  SELF:=l;
+END;
+NestedDS := PROJECT(rawPostings,filter3(LEFT));
+output(NestedDS);
\ No newline at end of file
diff --git a/TextSearch/Inverted/Test_Moby.ecl b/TextSearch/Inverted/Test_Moby.ecl
new file mode 100644
index 0000000..60395a3
--- /dev/null
+++ b/TextSearch/Inverted/Test_Moby.ecl
@@ -0,0 +1,47 @@
+/*
+Moby Thesaurus is an open-source set of files used in this project to return a set of synonyms.
+You can download it from this link: http://www.gutenberg.org/catalog/world/results?title=moby+list
+and spray the dataset in ECL Watch as a delimited CSV file using the default delimiter.
+*/
+#option('outputLimit',100);
+
+IMPORT std;
+CSVRecord := RECORD
+  string word;
+
+END;
+
+ file3 := DATASET('~thor::jdh::moby',
+                  CSVrecord,
+                  CSV(HEADING(1),
+                  SEPARATOR([',']),  // NOTE(review): with ',' as separator the single field may only receive the first column, leaving the splits below nothing to split - confirm
+                  TERMINATOR(['\n'])));
+
+file3;
+cont:= RECORD
+
+  unicode term;
+  set of unicode synonyms;
+END;
+cont filter(file3 doc) := TRANSFORM  // first comma-separated word becomes the term, the rest its synonyms
+
+SELF.term:=STD.STr.SplitWords(doc.word,',')[1];
+SELF.synonyms:=STD.STr.SplitWords(doc.word,',')[2..];
+SELF := doc;
+END;
+
+s:= PROJECT(file3, filter(LEFT));
+unicode t:='Abaddon';  // head word looked up by filter2
+output(s);
+
+cont2 := RECORD
+  unicode term;
+  set of unicode synonoms;
+
+END;
+cont2 filter2(file3 doc) := TRANSFORM  // rows whose head word is not t come out with an empty term and an empty set
+SELF.term:=if(STD.STr.SplitWords(doc.word,',')[1]=t,STD.STr.SplitWords(doc.word,',')[1],'');
+SELF.synonoms:=if(STD.STr.SplitWords(doc.word,',')[1]=t,STD.STr.SplitWords(doc.word,',')[2..],[]);
+END;
+s2:= PROJECT(file3, filter2(LEFT));
+output(s2);
\ No newline at end of file
diff --git a/TextSearch/Inverted/word2vec.ecl b/TextSearch/Inverted/word2vec.ecl
new file mode 100644
index 0000000..c8596a4
--- /dev/null
+++ b/TextSearch/Inverted/word2vec.ecl
@@ -0,0 +1,66 @@
+IMPORT Python;
+IMPORT TextSearch.Inverted;  // was TextSearch2.Inverted; aligned with word2vec_1/_2/_3
+IMPORT TextSearch.Common;  // was TextSearch2.Common
+IMPORT STD;
+IMPORT TextSearch.Inverted.Layouts;  // was TextSearch2.Inverted.Layouts
+#option('outputLimit',100);
+
+namerec := RECORD
+  string name;
+END;
+
+prefix := '~thor::jdh::';
+inputName := prefix + 'corrected_lda_ap_txtt_xml';
+
+Work1 := RECORD
+  UNICODE doc_number{XPATH('/DOC/DOCNO')};
+  UNICODE content{MAXLENGTH(32000000),XPATH('<>')};
+  UNICODE text{MAXLENGTH(32000000),XPATH('/DOC/TEXT')};
+  UNSIGNED8 file_pos{VIRTUAL(fileposition)};
+  UNICODE init;
+END;
+
+Inverted.Layouts.DocumentIngest cvt(Work1 lr) := TRANSFORM
+  SELF.identifier := TRIM(lr.doc_number, LEFT,RIGHT);
+  SELF.seqKey := inputName + '-' + INTFORMAT(lr.file_pos,12,1);
+  SELF.slugLine := lr.text[1..STD.Uni.Find(lr.text,'.',1)+1];
+  SELF.content := lr.content;
+  SELF.init:=lr.content;
+END;
+
+stem := prefix + 'corrected_lda_ap_txtt_xml';
+instance := 'initial2';
+ds0 := DATASET(inputName, Work1, XML('/DOC', NOROOT));
+inDocs := PROJECT(ds0, cvt(LEFT));
+info := Common.FileName_Info_Instance(stem, instance);
+enumDocs := Inverted.EnumeratedDocs(info, inDocs);
+p1 := Inverted.ParsedText(enumDocs);
+rawPostings := Inverted.RawPostings(enumDocs);
+
+OUTPUT(enumDocs,,'~ONLINE::Farah::OUT::Solution',OVERWRITE);
+
+rec0 := RECORD
+  unicode cell;
+END;
+
+rec := RECORD
+DATASET(rec0) arow;
+END;
+
+import python;
+DATASET(rec0) word2vec(dataset(Inverted.Layouts.DocumentIngest) A) := embed(Python)
+
+  import numpy as np
+  import re
+  import gensim
+
+  s=[]
+  for n in A:
+      s.append(gensim.utils.simple_preprocess(unicode(n.content)))  # one token list per document
+  model = gensim.models.Word2Vec(s,size=150,window=10,min_count=2,workers=10)
+  model.train(s,total_examples=len(s),epochs=10)
+  w1 = "school"
+  r= model.wv.most_similar(positive=w1)  # neighbours of the hard-coded probe word
+  return r
+endembed;
+ OUTPUT(CHOOSEN(word2vec(inDocs), 200), ALL, NAMED('First_200_blocks'));
\ No newline at end of file
diff --git a/TextSearch/Inverted/word2vec_1.ecl b/TextSearch/Inverted/word2vec_1.ecl
new file mode 100644
index 0000000..104ce06
--- /dev/null
+++ b/TextSearch/Inverted/word2vec_1.ecl
@@ -0,0 +1,69 @@
+IMPORT TextSearch.Inverted;
+IMPORT TextSearch.Common;
+IMPORT STD;
+IMPORT TextSearch.Inverted.Layouts;
+IMPORT Python;
+#option('outputLimit',100);
+
+namerec := RECORD
+  string name;
+END;
+
+prefix := '~thor::jdh::';
+inputName := prefix + 'corrected_lda_ap_txtt_xml';
+
+Work1 := RECORD
+  UNICODE doc_number{XPATH('/DOC/DOCNO')};
+  UNICODE content{MAXLENGTH(32000000),XPATH('<>')};
+  UNICODE text{MAXLENGTH(32000000),XPATH('/DOC/TEXT')};
+  UNSIGNED8 file_pos{VIRTUAL(fileposition)};
+  UNICODE init;
+END;
+
+Inverted.Layouts.DocumentIngest cvt(Work1 lr) := TRANSFORM
+  SELF.identifier := TRIM(lr.doc_number, LEFT,RIGHT);
+  SELF.seqKey := inputName + '-' + INTFORMAT(lr.file_pos,12,1);
+  SELF.slugLine := lr.text[1..STD.Uni.Find(lr.text,'.',1)+1];
+  SELF.content := lr.content;
+  SELF.init:=lr.content;
+END;
+
+
+stem := prefix + 'corrected_lda_ap_txtt_xml';
+instance := 'initial2';
+
+ds0 := DATASET(inputName, Work1, XML('/DOC', NOROOT));
+inDocs := PROJECT(ds0, cvt(LEFT));
+
+info := Common.FileName_Info_Instance(stem, instance);
+enumDocs := Inverted.EnumeratedDocs(info, inDocs);
+p1 := Inverted.ParsedText(enumDocs);
+rawPostings := Inverted.RawPostings(enumDocs);
+
+OUTPUT(enumDocs,,'~ONLINE::Farah::OUT::Solution77',OVERWRITE);
+
+rec0 := RECORD
+  unicode cell;
+END;
+
+rec := RECORD
+DATASET(rec0) arow;
+END;
+
+import python;
+DATASET(rec0) word2vec(dataset(Inverted.Layouts.DocumentIngest) A) := embed(Python)
+
+  import numpy as np
+  import re
+  import gensim
+
+  s=[]
+  for n in A:
+      s.append(gensim.utils.simple_preprocess(unicode(n.content)))  # one token list per document
+  model = gensim.models.Word2Vec(s,size=150,window=10,min_count=2,workers=10)
+  model.train(s,total_examples=len(s),epochs=10)
+  w1 = "school"
+  r= model.wv.most_similar(positive=w1)  # neighbours of the hard-coded probe word
+  return r
+endembed;
+ OUTPUT(CHOOSEN(word2vec(inDocs), 200), ALL, NAMED('First_200_blocks'));
\ No newline at end of file
diff --git a/TextSearch/Inverted/word2vec_2.ecl b/TextSearch/Inverted/word2vec_2.ecl
new file mode 100644
index 0000000..16a6782
--- /dev/null
+++ b/TextSearch/Inverted/word2vec_2.ecl
@@ -0,0 +1,69 @@
+IMPORT TextSearch.Inverted;
+IMPORT TextSearch.Common;
+IMPORT STD;
+IMPORT TextSearch.Inverted.Layouts;
+IMPORT Python;
+
+#option('outputLimit',100);
+
+namerec := RECORD
+  string name;
+END;
+
+prefix := '~thor::jdh::';
+inputName := prefix + 'corrected_lda_ap_txtt_xml';
+
+Work1 := RECORD
+  UNICODE doc_number{XPATH('/DOC/DOCNO')};
+  UNICODE content{MAXLENGTH(32000000),XPATH('<>')};
+  UNICODE text{MAXLENGTH(32000000),XPATH('/DOC/TEXT')};
+  UNSIGNED8 file_pos{VIRTUAL(fileposition)};
+  UNICODE init;
+END;
+
+Inverted.Layouts.DocumentIngest cvt(Work1 lr) := TRANSFORM
+  SELF.identifier := TRIM(lr.doc_number, LEFT,RIGHT);
+  SELF.seqKey := inputName + '-' + INTFORMAT(lr.file_pos,12,1);
+  SELF.slugLine := lr.text[1..STD.Uni.Find(lr.text,'.',1)+1];
+  SELF.content := lr.content;
+  SELF.init:=lr.content;
+END;
+
+stem := prefix + 'corrected_lda_ap_txtt_xml';
+instance := 'initial2';
+ds0 := DATASET(inputName, Work1, XML('/DOC', NOROOT));
+inDocs := PROJECT(ds0, cvt(LEFT));
+info := Common.FileName_Info_Instance(stem, instance);
+enumDocs := Inverted.EnumeratedDocs(info, inDocs);
+p1 := Inverted.ParsedText(enumDocs);
+rawPostings := Inverted.RawPostings(enumDocs);
+
+OUTPUT(enumDocs,,'~ONLINE::Farah::OUT::Solution77',OVERWRITE);
+rec0 := RECORD
+  set of unicode cell;
+END;
+rec := RECORD
+DATASET(rec0) arow;
+END;
+
+import python;
+DATASET(rec0) word2vec(dataset(Inverted.Layouts.DocumentIngest) A, unicode word) := embed(Python)
+
+  import numpy as np
+  import re
+  import gensim
+
+  s=[]
+  for n in A:
+      s.append(gensim.utils.simple_preprocess(unicode(n.content)))  # one token list per document
+  model = gensim.models.Word2Vec(s,size=150,window=10,min_count=2,workers=10)
+  model.train(s,total_examples=len(s),epochs=10)
+  w1 =word.split()  # query split on whitespace
+  r=[]
+  for i in w1:
+      r.append([i,unicode(model.wv.most_similar(positive=(i)))])  # [query word, printed neighbour list]
+  return r
+endembed;
+query:=u'students in school' ;
+res:=word2vec(inDocs,query);
+Output(res);
diff --git a/TextSearch/Inverted/word2vec_3.ecl b/TextSearch/Inverted/word2vec_3.ecl
new file mode 100644
index 0000000..69e8b8c
--- /dev/null
+++ 
b/TextSearch/Inverted/word2vec_3.ecl
@@ -0,0 +1,95 @@
+IMPORT TextSearch.Inverted;
+IMPORT TextSearch.Common;
+IMPORT STD;
+IMPORT TextSearch.Inverted.Layouts;
+IMPORT Python;
+#option('outputLimit',100);
+
+namerec := RECORD
+  string name;
+END;
+
+prefix := '~thor::jdh::';
+inputName := prefix + 'corrected_lda_ap_txtt_xml';  // logical name of the XML input file
+
+Work1 := RECORD  // one row per /DOC element read from inputName
+  UNICODE doc_number{XPATH('/DOC/DOCNO')};
+  UNICODE content{MAXLENGTH(32000000),XPATH('<>')};
+  UNICODE text{MAXLENGTH(32000000),XPATH('/DOC/TEXT')};
+  UNSIGNED8 file_pos{VIRTUAL(fileposition)};
+  UNICODE init;
+END;
+
+
+Inverted.Layouts.DocumentIngest cvt(Work1 lr) := TRANSFORM  // Work1 row -> DocumentIngest row
+  SELF.identifier := TRIM(lr.doc_number, LEFT,RIGHT);
+  SELF.seqKey := inputName + '-' + INTFORMAT(lr.file_pos,12,1);  // file name plus zero-padded file position
+  SELF.slugLine := lr.text[1..STD.Uni.Find(lr.text,'.',1)+1];  // text up to one character past the first '.'
+  SELF.content := lr.content;
+  SELF.init:=lr.content;
+END;
+
+stem := prefix + 'corrected_lda_ap_txtt_xml';
+instance := 'initial2';
+
+ds0 := DATASET(inputName, Work1, XML('/DOC', NOROOT));
+inDocs := PROJECT(ds0, cvt(LEFT));
+info := Common.FileName_Info_Instance(stem, instance);
+
+OUTPUT(inDocs);
+
+rec0 := RECORD
+  unicode cell;
+END;
+
+rec := RECORD
+DATASET(rec0) arow;
+END;
+
+import python;
+DATASET(rec0) word2vec(dataset(Inverted.Layouts.DocumentIngest) A, unicode word) := embed(Python)
+
+  import numpy as np
+  import re
+  import gensim
+
+  s=[]
+  for n in A:
+      s.append(gensim.utils.simple_preprocess(unicode(n.content)))  # one token list per document
+  model = gensim.models.Word2Vec(s,size=150,window=10,min_count=2,workers=10)
+  model.train(s,total_examples=len(s),epochs=10)
+  w1 =word.split()  # query split on whitespace
+  r=[]
+  for i in w1:
+      r.append([i,unicode(model.wv.most_similar(positive=(i)))])  # [query word, printed neighbour list]
+
+  return (r[0][1]).split(',')  # NOTE(review): splits the printed list for the first query word on ','; the pieces keep brackets, quotes and scores - confirm this is the intended cell content
+endembed;
+
+query:=u'students in school' ;
+res:=word2vec(inDocs,query);
+Output(res);
+rec2 := RECORD
+  DATASET (Inverted.Layouts.DocumentIngest) cell;
+END;
+ Dataset(rec2) filter(dataset(Inverted.Layouts.DocumentIngest) A, DATASET (rec0) B) := embed(Python)
+
+  import numpy as np
+  import re
+  import gensim
+  s=[]
+  r=[]
+  m=[]
+  l=[]
+
+  for i in B:
+      for n in A:
+          if (unicode (n.content).find(unicode(i.cell))!=-1):  # find() returns -1 when absent; the old test against 0 accepted non-matches and rejected a match at offset 0
+              if (n.content not in m):  # m lists the contents already emitted
+                  m.append(n.content)  # store the content itself: the old one-element list never compared equal, so duplicates were never skipped
+                  l.append([n])
+  return l
+endembed;
+res2:=filter(inDocs,res);
+Output(res2);
+OUTPUT(CHOOSEN(res2, 100), ALL, NAMED('First_100_blocks'));
\ No newline at end of file