-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathUnderCalc.cs
More file actions
619 lines (482 loc) · 17.3 KB
/
UnderCalc.cs
File metadata and controls
619 lines (482 loc) · 17.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
/*
* Created by SharpDevelop.
* User: User
* Date: 14/10/2011
* Time: 21:08
*
* To change this template use Tools | Options | Coding | Edit Standard Headers.
*/
using System;
using System.IO;
using System.Collections;
using JavaConvert.Data;
using System.Text;
using System.Windows.Forms;
//using MyLogger;
namespace WordListAnalyser2
{
/// <summary>
/// Calculates the conflation index (CI) and the understemming index (UI)
/// from a stemmed file.
/// </summary>
/// <remarks>
/// Based on Java code by Rob Hooper, 03/01/2004.
/// </remarks>
public class UnderCalc
{
// Shared state. NOTE: despite the original "instance variables" label, every field
// below is static, so all UnderCalc objects share (and overwrite) the same values.
//static string analysefile;
// Path of the input file; assigned by the string constructor and opened by
// strongCalc() and weakCalc().
static string infile;
// Readers over infile used by the file-based strong and weak passes respectively.
static StreamReader stronginput, weakinput ;
// Lines of the group currently being measured; refilled by the getNext*WordGroup methods.
static ArrayList wordgroup = new ArrayList(); //was Vector in Java
// Not referenced anywhere else in the visible part of this file.
static ArrayList tempwordgroup = new ArrayList();//was Vector in Java
// A line containing this text ends a group in both the strong and the weak pass.
static string strongbarrier = "====";
// A line containing this text ends a group in the strong pass and is skipped in the weak pass.
static string weakbarrier = "----";
// Per pass: GDMT / GUMT are running sums of DMT() / UMT() over all groups,
// UI is GUMT / GDMT (0 when GDMT is 0) and CI is 1 - UI.
static double strongGDMT, strongGUMT, strongUI, strongCI, weakGDMT, weakGUMT,weakUI, weakCI;
// In-memory source read by the pd_* methods; assigned by the porterData constructor.
static porterData givenPData;
// Debug trace: DMT() appends one "count , dmt" entry per call and never clears it.
public static StringBuilder sb = new StringBuilder(); // tom 03/11/2015 for debugging DMT();
// Debug counter: incremented once per DMT() call.
public static int numDMTCalls = 0;
/// <summary>
/// Prepares the file-based workflow: remembers the input path and zeroes every
/// strong and weak total and index. Nothing is read or calculated here.
/// </summary>
/// <param name="s">Path of the input file (stemmedFile.txt or a truncated file).</param>
public UnderCalc(string s)
{
    infile = s;

    // Chained assignment: all four values of each pass start from zero.
    strongGDMT = strongGUMT = strongUI = strongCI = 0.0;
    weakGDMT = weakGUMT = weakUI = weakCI = 0.0;
}
/// <summary>
/// Prepares the in-memory workflow: remembers the supplied data object, rewinds
/// its cursor and zeroes every strong and weak total and index.
/// Nothing is calculated here.
/// </summary>
/// <param name="data">Source that the pd_* methods read lines from.</param>
public UnderCalc(porterData data)
{
    givenPData = data;
    givenPData.resetCursor();

    // Chained assignment: all four values of each pass start from zero.
    strongGDMT = strongGUMT = strongUI = strongCI = 0.0;
    weakGDMT = weakGUMT = weakUI = weakCI = 0.0;
}
//= new StreamReader(new FileStream(infile, FileMode.Open));//was BufferedReader in Java
//public UnderCalc()
//{
//try {
//infile ="";
//using (FileStream fs = new FileStream(analysefile, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) // Must be FileShare.ReadWrite, FileShare.Read doesn't work for some odd reason.
//using (StreamReader sr = new StreamReader(fs))
//infile = sr.ReadToEnd();
//} catch (Exception ex) {
//MessageBox.Show (ex.Message, "UnderCalc",MessageBoxButtons.OK,MessageBoxIcon.Exclamation);
//}
//}
// end of constructor
/// <summary>
/// Sorts the lines of the current word group in place so that lines starting
/// with the same text end up next to each other.
/// </summary>
/// <remarks>
/// The comparison is ordinal (character-code order), which is what Java's
/// String.compareTo did in the code this class was ported from. The previous
/// culture-sensitive CompareTo gives low weight to characters such as '-',
/// so lines beginning "coop" and "co-op" could be interleaved, and the result
/// depended on the culture of the machine running the program.
/// </remarks>
public static void sortWordGroup()
{
    // Library sort replaces the hand-written bubble sort.
    // With an ordinal comparer only identical strings compare equal, so the
    // sort not being stable cannot change the outcome.
    wordgroup.Sort(StringComparer.Ordinal);
}
// end of method 'sortWordGroup'
/// <summary>
/// Fills the word group with the next group read from <c>weakinput</c>,
/// ignoring weak barriers, then sorts the group.
/// </summary>
/// <param name="s">
/// The line already read by the caller. If it contains the strong barrier it
/// is skipped and reading starts from the following line.
/// </param>
/// <remarks>
/// Reading stops at the first line containing the strong barrier (that line is
/// consumed) or at the end of the input. Every line read is checked before it
/// is stored, so a weak barrier directly followed by the end of the input,
/// another weak barrier or a strong barrier no longer puts a null or a barrier
/// line into the group, and no longer runs on into the following group.
/// </remarks>
public static void getNextWeakWordGroup(string s)
{
    wordgroup.Clear();

    string line = s;

    // Two barriers in a row: drop the one passed in and start from the next line.
    if (line != null && line.Contains(strongbarrier))
    {
        line = weakinput.ReadLine();
    }

    // A null line means end of input; a strong barrier closes the group.
    while (line != null && !line.Contains(strongbarrier))
    {
        // Weak barriers do not split groups in this pass, so they are skipped.
        if (!line.Contains(weakbarrier))
        {
            wordgroup.Add(line);
        }
        line = weakinput.ReadLine();
    }

    sortWordGroup();
}
/// <summary>
/// Fills the word group with the next group read from <c>givenPData</c>,
/// ignoring weak barriers, then sorts the group.
/// </summary>
/// <param name="s">
/// The line already read by the caller. If it contains the strong barrier it
/// is skipped and reading starts from the following line.
/// </param>
/// <remarks>
/// Reading stops at the first line containing the strong barrier (that line is
/// consumed) or when readLine() returns null. Every line read is checked before
/// it is stored, so a weak barrier directly followed by the end of the data,
/// another weak barrier or a strong barrier no longer puts a null or a barrier
/// line into the group, and no longer runs on into the following group.
/// </remarks>
public static void pd_getNextWeakWordGroup(string s)
{
    wordgroup.Clear();

    string line = s;

    // Two barriers in a row: drop the one passed in and start from the next line.
    if (line != null && line.Contains(strongbarrier))
    {
        line = givenPData.readLine();
    }

    // A null line means end of data; a strong barrier closes the group.
    while (line != null && !line.Contains(strongbarrier))
    {
        // Weak barriers do not split groups in this pass, so they are skipped.
        if (!line.Contains(weakbarrier))
        {
            wordgroup.Add(line);
        }
        line = givenPData.readLine();
    }

    sortWordGroup();
}
// end of method 'getNextWeakWordGroup'
/// <summary>
/// Fills the word group with the next group read from <c>stronginput</c>,
/// treating weak barriers exactly like strong ones, then sorts the group.
/// </summary>
/// <param name="w">
/// The line already read by the caller. If it contains either barrier it is
/// skipped and reading starts from the following line.
/// </param>
/// <remarks>
/// Reading stops at the first line containing either barrier (that line is
/// consumed) or at the end of the input. The end-of-input check is made before
/// the line is inspected, so a barrier on the last line of the file no longer
/// causes a NullReferenceException.
/// </remarks>
public static void getNextStrongWordGroup(string w)
{
    wordgroup.Clear();

    string line = w;

    // Two barriers in a row: drop the one passed in and start from the next line.
    if (line != null && (line.Contains(strongbarrier) || line.Contains(weakbarrier)))
    {
        line = stronginput.ReadLine();
    }

    // A null line means end of input; either kind of barrier closes the group.
    while (line != null && !line.Contains(strongbarrier) && !line.Contains(weakbarrier))
    {
        wordgroup.Add(line);
        line = stronginput.ReadLine();
    }

    sortWordGroup();
}
/// <summary>
/// Fills the word group with the next group read from <c>givenPData</c>,
/// treating weak barriers exactly like strong ones, then sorts the group.
/// </summary>
/// <param name="w">
/// The line already read by the caller. If it contains either barrier it is
/// skipped and reading starts from the following line.
/// </param>
/// <remarks>
/// Reading stops at the first line containing either barrier (that line is
/// consumed) or when readLine() returns null. The null check is made before
/// the line is inspected, so a barrier as the last line of the data no longer
/// causes a NullReferenceException.
/// </remarks>
public static void pd_getNextStrongWordGroup(string w)
{
    wordgroup.Clear();

    string line = w;

    // Two barriers in a row: drop the one passed in and start from the next line.
    if (line != null && (line.Contains(strongbarrier) || line.Contains(weakbarrier)))
    {
        line = givenPData.readLine();
    }

    // A null line means end of data; either kind of barrier closes the group.
    while (line != null && !line.Contains(strongbarrier) && !line.Contains(weakbarrier))
    {
        wordgroup.Add(line);
        line = givenPData.readLine();
    }

    sortWordGroup();
}
// end of method 'getNextStrongWordGroup'
/// <summary>
/// Calculates the 'desired merge total' (DMT) of the current word group:
/// 0.5 * n * (n - 1) for a group of n lines, and 0 for an empty group.
/// </summary>
/// <returns>The desired merge total of the current word group.</returns>
/// <remarks>
/// Debugging side effects: each call increments <c>numDMTCalls</c> and appends
/// one "count , dmt" entry to <c>sb</c>.
/// </remarks>
public static double DMT()
{
    numDMTCalls += 1; // tom debugging 03/11/2015

    int memberCount = wordgroup.Count; // was .size in Java
    double desiredMerges = 0;

    if (memberCount != 0)
    {
        double halfCount = 0.5 * memberCount;
        desiredMerges = halfCount * (memberCount - 1);
    }

    string traceEntry = "\n" + memberCount + " , " + desiredMerges;
    sb.Append(traceEntry);

    return desiredMerges;
}
// end of method 'DMT'
/// <summary>
/// Calculates the 'unachieved merge total' (UMT) of the current word group.
/// </summary>
/// <returns>
/// 0.5 * sum of ng * (N - ng), where N is the number of lines in the group and
/// the sum runs over each run of adjacent lines whose first token (the stem)
/// is the same, ng being the length of that run. Groups with fewer than two
/// lines give 0.
/// </returns>
/// <remarks>
/// The group must already be sorted so that equal stems are adjacent; the
/// getNext*WordGroup methods do this before returning.
/// Changes from the previous version: an empty group now yields 0 instead of
/// -0.5, and the product is formed in double arithmetic so that very large
/// groups cannot overflow a 32-bit int.
/// </remarks>
public static double UMT()
{
    int groupSize = wordgroup.Count;

    // Nothing can be left unmerged in an empty or single-line group.
    if (groupSize <= 1)
    {
        return 0.0;
    }

    double total = 0.0;
    int runLength = 1;
    string previousStem = new StringTokenizer(Convert.ToString(wordgroup[0])).NextToken;

    for (int i = 1; i < groupSize; i++)
    {
        // The stem is the first token of the line.
        string stem = new StringTokenizer(Convert.ToString(wordgroup[i])).NextToken;

        if (previousStem.Equals(stem))
        {
            runLength++;
        }
        else
        {
            // The stem changed: account for the run that just ended and start a new one.
            total += (double)runLength * (groupSize - runLength);
            runLength = 1;
        }

        previousStem = stem;
    }

    // Account for the final run, which the loop never closes.
    total += (double)runLength * (groupSize - runLength);

    // Each unordered pair was counted once from each side, hence the halving.
    return 0.5 * total;
}
// end of method 'UMT'
/// <summary>
/// Runs the strong pass over <c>infile</c>: weak barriers are treated as group
/// boundaries. Adds each group's DMT() and UMT() to <c>strongGDMT</c> and
/// <c>strongGUMT</c>, then sets <c>strongUI</c> (GUMT / GDMT, or 0 when GDMT
/// is 0) and <c>strongCI</c> (1 - UI).
/// </summary>
/// <remarks>
/// Processing stops at the end of the file or at the first group-leading line
/// that contains "//". The totals are added to, not reset, so they carry over
/// from whatever value they held on entry.
/// The file is opened for reading only; the previous FileStream(path, FileMode.Open)
/// call requested read/write access and therefore failed on read-only files.
/// </remarks>
public static void strongCalc()
{
    // The reader is kept in the static field because getNextStrongWordGroup reads from it.
    stronginput = new StreamReader(
        new FileStream(infile, FileMode.Open, FileAccess.Read, FileShare.Read));
    try
    {
        wordgroup.Clear();

        string snextline = stronginput.ReadLine();
        while (snextline != null && !snextline.Contains("//"))
        {
            // Reads the rest of the group that starts at snextline.
            getNextStrongWordGroup(snextline);
            strongGDMT = strongGDMT + DMT();
            strongGUMT = strongGUMT + UMT();
            snextline = stronginput.ReadLine();
        }

        // Guard against dividing by zero when no group had two or more lines.
        if (strongGDMT == 0.0)
        {
            strongUI = 0.0;
        }
        else
        {
            strongUI = strongGUMT / strongGDMT;
        }
        strongCI = 1 - strongUI;
    }
    finally
    {
        // Release the file even on failure; later stages need to reopen it.
        stronginput.Close();
    }
}
/// <summary>
/// Runs the strong pass over <c>givenPData</c>: weak barriers are treated as
/// group boundaries. Adds each group's DMT() and UMT() to <c>strongGDMT</c>
/// and <c>strongGUMT</c>, then sets <c>strongUI</c> (GUMT / GDMT, or 0 when
/// GDMT is 0) and <c>strongCI</c> (1 - UI).
/// </summary>
/// <remarks>
/// Processing stops when readLine() returns null or at the first group-leading
/// line that contains "//".
/// </remarks>
public static void pd_strongCalc()
{
    givenPData.resetCursor();
    wordgroup.Clear();

    // Each iteration starts on the first line of a group; the group reader
    // consumes the rest of that group including its closing barrier.
    for (string groupStart = givenPData.readLine();
         groupStart != null && !groupStart.Contains("//");
         groupStart = givenPData.readLine())
    {
        pd_getNextStrongWordGroup(groupStart);
        strongGDMT = strongGDMT + DMT();
        strongGUMT = strongGUMT + UMT();
    }

    // Avoid dividing by zero when nothing was accumulated.
    strongUI = strongGDMT.Equals(0) ? 0.0 : strongGUMT / strongGDMT;
    strongCI = 1 - strongUI;
}
// end of method "strongCalc"
/// <summary>
/// Runs the weak pass over <c>infile</c>: weak barriers are ignored and only
/// strong barriers separate groups. Adds each group's DMT() and UMT() to
/// <c>weakGDMT</c> and <c>weakGUMT</c>, then sets <c>weakUI</c> (GUMT / GDMT,
/// or 0 when GDMT is 0) and <c>weakCI</c> (1 - UI).
/// </summary>
/// <remarks>
/// Processing stops at the end of the file or at the first group-leading line
/// that contains "//". The totals are added to, not reset, so they carry over
/// from whatever value they held on entry.
/// Changes from the previous version: the file is opened for reading only (the
/// old call requested read/write access); the reader is closed even when an
/// exception is thrown; and DMT() is called once per group instead of twice,
/// so <c>numDMTCalls</c> and the <c>sb</c> trace are no longer doubled.
/// </remarks>
public static void weakCalc()
{
    // The reader is kept in the static field because getNextWeakWordGroup reads from it.
    weakinput = new StreamReader(
        new FileStream(infile, FileMode.Open, FileAccess.Read, FileShare.Read));
    try
    {
        // Counters used only by the diagnostic output below.
        int numLines = 0;
        int numZeroDMT = 0;
        int numNonZeroDMT = 0;
        double weakGDMTBefore = weakGDMT;

        string wnextline = weakinput.ReadLine();
        while (wnextline != null && !wnextline.Contains("//"))
        {
            numLines++;

            // Reads the rest of the group that starts at wnextline.
            getNextWeakWordGroup(wnextline);

            double groupDMT = DMT();
            if (groupDMT == 0.0)
            {
                numZeroDMT++;
            }
            else
            {
                numNonZeroDMT++;
            }

            weakGDMT = weakGDMT + groupDMT;
            weakGUMT = weakGUMT + UMT();
            wnextline = weakinput.ReadLine();
        }

        System.Diagnostics.Debug.WriteLine("UnderCalc.cs 373: weakGDMT = " + weakGDMT + ", weakGUMT =" + weakGUMT + ", numLines =" + numLines );
        System.Diagnostics.Debug.WriteLine("Undercalc.cs 374: numZeroDMT =" + numZeroDMT + ", numNonZeroDMT =" + numNonZeroDMT + " , weakGDMTBefore =" + weakGDMTBefore);
    }
    finally
    {
        // Release the file even on failure.
        weakinput.Close();
    }

    // Guard against dividing by zero when no group had two or more lines.
    if (weakGDMT == 0.0)
    {
        weakUI = 0;
    }
    else
    {
        weakUI = weakGUMT / weakGDMT;
    }
    weakCI = 1 - weakUI;
}// end of method "weakCalc"
/// <summary>
/// Runs the weak pass over <c>givenPData</c>: weak barriers are ignored and
/// only strong barriers separate groups. Adds each group's DMT() and UMT() to
/// <c>weakGDMT</c> and <c>weakGUMT</c>, then sets <c>weakUI</c> (GUMT / GDMT,
/// or 0 when GDMT is 0) and <c>weakCI</c> (1 - UI).
/// </summary>
/// <remarks>
/// Processing stops when readLine() returns null or at the first group-leading
/// line that contains "//".
/// Change from the previous version: DMT() is called once per group instead of
/// twice, so <c>numDMTCalls</c> and the <c>sb</c> trace are no longer doubled.
/// The unused lastLineNotNull local was removed.
/// </remarks>
public static void pd_weakCalc()
{
    givenPData.resetCursor();

    // Counters used only by the diagnostic output below.
    int numLines = 0;
    int numZeroDMT = 0;
    int numNonZeroDMT = 0;
    double weakGDMTBefore = weakGDMT;

    string wnextline = givenPData.readLine();
    while (wnextline != null && !wnextline.Contains("//"))
    {
        numLines++;

        // Reads the rest of the group that starts at wnextline.
        pd_getNextWeakWordGroup(wnextline);

        double groupDMT = DMT();
        if (groupDMT == 0.0)
        {
            numZeroDMT++;
        }
        else
        {
            numNonZeroDMT++;
        }

        weakGDMT = weakGDMT + groupDMT;
        weakGUMT = weakGUMT + UMT();
        wnextline = givenPData.readLine();
    }

    System.Diagnostics.Debug.WriteLine("UnderCalc.cs 373: weakGDMT = " + weakGDMT + ", weakGUMT =" + weakGUMT + ", numLines =" + numLines );
    System.Diagnostics.Debug.WriteLine("Undercalc.cs 374: numZeroDMT =" + numZeroDMT + ", numNonZeroDMT =" + numNonZeroDMT + " , weakGDMTBefore =" + weakGDMTBefore);

    // Guard against dividing by zero when no group had two or more lines.
    if (weakGDMT == 0.0)
    {
        weakUI = 0;
    }
    else
    {
        weakUI = weakGUMT / weakGDMT;
    }
    weakCI = 1 - weakUI;
}// end of method "weakCalc"
/// <summary>
/// Runs both file-based passes, strong first and then weak, and afterwards
/// closes both readers.
/// </summary>
public static void calculateResults()
{
    System.Diagnostics.Debug.WriteLine("UnderCalc.cs CalculateResults 366: infile:" + infile);

    // Strong pass, then weak pass.
    strongCalc();
    weakCalc();

    // Close both readers.
    weakinput.Close();
    stronginput.Close();
}
// end of method "calculateResults"
/// <summary>
/// Runs both in-memory passes, strong first and then weak. No readers are
/// opened here, so there is nothing to close afterwards.
/// </summary>
public static void pd_calculateResults()
{
    pd_strongCalc();
    pd_weakCalc();
}
// Result accessors. Each one returns the current value of the matching shared
// (static) total or index, as last written by a constructor or a calculation pass.

/// <summary>Returns the desired merge total summed over all groups of the strong pass.</summary>
public double getStrongGDMT()
{
    return strongGDMT;
}

/// <summary>Returns the unachieved merge total summed over all groups of the strong pass.</summary>
public double getStrongGUMT()
{
    return strongGUMT;
}

/// <summary>Returns the strong understemming index (GUMT / GDMT, or 0 when GDMT is 0).</summary>
public double getStrongUI()
{
    return strongUI;
}

/// <summary>Returns the strong conflation index (1 - UI).</summary>
public double getStrongCI()
{
    return strongCI;
}

/// <summary>Returns the desired merge total summed over all groups of the weak pass.</summary>
public double getWeakGDMT()
{
    return weakGDMT;
}

/// <summary>Returns the unachieved merge total summed over all groups of the weak pass.</summary>
public double getWeakGUMT()
{
    return weakGUMT;
}

/// <summary>Returns the weak understemming index (GUMT / GDMT, or 0 when GDMT is 0).</summary>
public double getWeakUI()
{
    return weakUI;
}

/// <summary>Returns the weak conflation index (1 - UI).</summary>
public double getWeakCI()
{
    return weakCI;
}
}
// *** end of class "UnderCalc"
}