-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathsplit_data.py
More file actions
23 lines (20 loc) · 804 Bytes
/
split_data.py
File metadata and controls
23 lines (20 loc) · 804 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# -*- coding: utf-8 -*-
# Author: HeYing
# Creation Date: 2019-04-21
import pandas as pd
import os
# Step 1: clean the raw Quora question-pairs dump.
# The input is tab-separated and its first row is the column header.
quora = pd.read_csv('quora_duplicate_questions.tsv', sep="\t", header=0)
# Drop every row that has a missing value in any column.
quora = quora.dropna()
# Trim leading/trailing whitespace from both question columns.
quora.question1 = quora.question1.str.strip()
quora.question2 = quora.question2.str.strip()
quora.to_csv('clean_data.csv', sep='\t', index=False)

# Step 2: split the cleaned file into fixed-size pieces under raw/.
count = 0
chunkrows = 2000  # read 2k rows at a time
# Column names written as the header row of every split file.
headers = ["id", "qid1", "qid2", "question1", "question2", "is_duplicate"]
# iterator/chunksize make read_csv yield DataFrames of at most `chunkrows`
# rows instead of loading the whole file at once.
df = pd.read_csv('clean_data.csv', sep='\t', header=0, iterator=True, chunksize=chunkrows)
# exist_ok=True lets the script be re-run without failing on an existing raw/.
os.makedirs('raw', exist_ok=True)
for chunk in df:  # for each 2k rows
    outname = 'raw/split_%d.csv' % count
    # Each chunk goes to its own file with its own header row. Mode 'w'
    # overwrites any file left by a previous run rather than appending
    # duplicate rows and headers to it.
    chunk.to_csv(outname, mode='w', header=headers, index=None, sep='\t')
    count += 1