-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathsplitFile.py
More file actions
35 lines (25 loc) · 992 Bytes
/
splitFile.py
File metadata and controls
35 lines (25 loc) · 992 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import pandas as pd
# Creates k (default 10) near-equally sized samples from the main sorted
# dataset for cross-validation.
# Input: Full set of SORTED gt data - default location data/gt/full.csv
def shuffleAndSplit(inputPath=r"data/gt/full.csv", outputDir="data/eval",
                    numFolds=10, seed=None):
    """Shuffle the ground-truth CSV and write it out as cross-validation folds.

    Reads the first two columns of ``inputPath``, strips surrounding
    whitespace from the ``'New Code'`` column, removes duplicate rows,
    shuffles what is left and writes the result to ``k1.csv`` ...
    ``k<numFolds>.csv`` inside ``outputDir``.

    Every remaining row is written to exactly one fold. Fold sizes differ by
    at most one row: when the row count is not a multiple of ``numFolds`` the
    first ``rows % numFolds`` folds receive one extra row. If there are fewer
    rows than folds, the trailing folds contain only the header line.

    Args:
        inputPath: CSV file to read; one of its first two columns must be
            named ``'New Code'``.
        outputDir: Directory that receives the fold files. It is created if
            it does not exist.
        numFolds: Number of fold files to write (must be >= 1).
        seed: Optional value passed to ``DataFrame.sample`` as
            ``random_state`` to make the shuffle reproducible. ``None``
            keeps the shuffle non-deterministic.

    Raises:
        ValueError: If ``numFolds`` is smaller than 1.
        KeyError: If the selected columns do not include ``'New Code'``.
    """
    import os  # local import so the block does not depend on file-level imports

    if numFolds < 1:
        raise ValueError("numFolds must be at least 1")

    raw = pd.read_csv(inputPath, usecols=[0, 1])

    # Normalise the code column first so rows that differ only by padding
    # whitespace are recognised as duplicates below.
    raw['New Code'] = raw['New Code'].str.strip()

    # drop_duplicates returns a new frame; the result has to be kept.
    raw = raw.drop_duplicates()

    # Shuffles set
    shuffled = raw.sample(frac=1, random_state=seed)

    # Gets number of rows
    numOfRows = shuffled.shape[0]
    print('Number of Rows in dataframe : ', numOfRows)

    os.makedirs(outputDir, exist_ok=True)

    # Spread the remainder over the first folds instead of dropping it, and
    # iterate over a fixed number of folds so the loop always terminates.
    baseSize, remainder = divmod(numOfRows, numFolds)
    start = 0
    for fileId in range(1, numFolds + 1):
        foldSize = baseSize + (1 if fileId <= remainder else 0)
        stop = start + foldSize
        fold = shuffled.iloc[start:stop]
        print(start, stop, foldSize)
        print(fold)
        outputFile = os.path.join(outputDir, "k" + str(fileId) + ".csv")
        fold.to_csv(outputFile, index=False, header=True)
        start = stop