forked from EveryTimeIWill18/Cython_Repo
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCythonFileTokenizer.pyx
More file actions
147 lines (127 loc) · 4.35 KB
/
Copy pathCythonFileTokenizer.pyx
File metadata and controls
147 lines (127 loc) · 4.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# -*- coding: utf-8 -*-
"""
data_mining.pyx
~~~~~~~~~~~~~~~
This module is a cython pyx file that is used to mine text efficiently
from the various support file formats.
"""
# -- python imports
import os
import sys
import time
# -- cython c imports
from libc.stdlib cimport malloc, realloc, free
from libc.stdio cimport fopen, fclose, FILE, EOF, fseek, SEEK_END, SEEK_SET
from libc.stdio cimport ftell, fgetc, fgets, getc, gets, feof, fread, getline
from libc.string cimport strlen, memcpy, strcpy, strtok, strchr, strncpy
from cython.parallel import prange, parallel, threadid
# preprocessor directive
DEF BUFFER = 100
# template for numeric types
ctypedef fused numeric_var:
int
long
long long
float
double
cdef readonly struct Token:
char **token_array
int *token_size_array
cdef readonly struct Columns:
char **column_array
cdef readonly struct Rows:
Columns *Col
cdef readonly struct DataContainer:
char ***data_frame
char *columns
char **data_array
cdef class Tokenize:
"""Tokenize the input file/string"""
cdef Token *Tok
cdef FILE *fp
cdef readonly:
char *filename
char *column_header
char *delimiter
char newline
char *file_contents
char current_char
int iterator
int c
int num_columns
long file_size
int num_tokens
bint is_open
bint EO_STR
def __init__(self, char *delimiter, char *filename):
self.Tok = <Token*>malloc(sizeof(Token))
self.delimiter = delimiter
self.newline = b"\n"
self.fp = NULL
self.filename = filename
self.column_header = NULL
self.file_contents = NULL
self.file_size = 0
self.num_tokens = 0
self.iterator = 0
self.current_char = b" "
self.c = 0
self.EO_STR = 0
self.num_columns = 0
def open_file(self):
"""Open the file for reading."""
self.fp = fopen(self.filename, "r")
if self.fp == NULL:
raise FileNotFoundError(2, "No such file or directory: '%s'" % self.filename)
else:
# the file is now open
self.is_open = 1
def close_file(self):
"""Close the opened file."""
if self.is_open == 1:
if self.fp != NULL:
fclose(self.fp)
self.is_open = 0
else:
raise Exception(2, "An error occurred trying to close the file: '%s'" % self.filename)
def read_in_file(self):
"""Read the file contents."""
if self.is_open == 1:
fseek(self.fp, 0, SEEK_END)
self.file_size = ftell(self.fp)
fseek(self.fp, 0, SEEK_SET)
self.file_contents = <char*>malloc(self.file_size*sizeof(char))
fread(self.file_contents, 1, self.file_size, self.fp)
#fclose(self.fp)
self.is_open = 0
def get_columns(self):
"""Set up the column names."""
if self.file_contents != NULL:
tmp = 0
while True:
print(<str>chr(self.file_contents[self.iterator]))
if <str>chr(self.file_contents[self.iterator]) == "\n":
self.num_columns += 1
tmp = self.iterator
break
if <str>chr(self.file_contents[self.iterator]) == "\0":
self.EO_STR = 0 # enf of string reached
break
if <str>chr(self.file_contents[self.iterator]) == ",":
self.num_columns += 1
tmp = self.iterator
self.iterator += 1
self.column_header = <char*>malloc(self.iterator*sizeof(char))
strncpy(self.column_header, self.file_contents, self.iterator)
# Test the functionality #
###############################################################################
emlFile = b"Y:\\Shared\\USD\\Business Data and Analytics\\Claims_Pipeline_Files\\Mapping_Files\\EmlMappingFile.csv"
tokenizer = Tokenize(b',', emlFile)
tokenizer.open_file()
tokenizer.read_in_file()
tokenizer.get_columns()
print(tokenizer.file_size)
print(tokenizer.is_open)
print(tokenizer.column_header)
tokenizer.close_file()
print(tokenizer.is_open)