From 9b1e218de2fc063292cc3719cc2c1f084cb7ebec Mon Sep 17 00:00:00 2001 From: Rohdin Johan A Date: Mon, 24 Feb 2020 15:09:11 +0100 Subject: [PATCH 1/5] Added functions for reading feature chunks. --- kaldi_io/kaldi_io.py | 177 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) diff --git a/kaldi_io/kaldi_io.py b/kaldi_io/kaldi_io.py index 498f9a8..eee566e 100755 --- a/kaldi_io/kaldi_io.py +++ b/kaldi_io/kaldi_io.py @@ -409,6 +409,7 @@ def _read_mat_binary(fd): mat = np.reshape(vec,(rows,cols)) return mat + def _read_mat_ascii(fd): rows = [] while 1: @@ -460,6 +461,182 @@ def _read_compressed_mat(fd, format): return mat.T # transpose! col-major -> row-major, +### +def read_file_segm(rxfile, start, end): + + assert( len(rxfile)==len(start)==len(end) ) + + data = [] + try: + for i,rxf in enumerate( rxfile ): + data.append( read_mat_n(rxf, start[i], end[i]) ) + except: + print("An exception occurred") + return data + +def read_mat_ark_n(file_or_fd, start, end): + """ generator(key,mat) = read_mat_ark(file_or_fd) + Returns generator of (key,matrix) tuples, read from ark file/stream. + file_or_fd : scp, gzipped scp, pipe or opened file descriptor. + + Iterate the ark: + for key,mat in kaldi_io.read_mat_ark(file): + ... + + Read ark to a 'dictionary': + d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } + """ + fd = open_or_fd(file_or_fd) + try: + key = read_key(fd) + while key: + mat = read_mat_n(fd, start, end) + yield key, mat + key = read_key(fd) + finally: + if fd is not file_or_fd : fd.close() + + +def read_mat_n(file_or_fd, start, end): + """ [mat] = read_mat(file_or_fd) + Reads single kaldi matrix, supports ascii and binary. + file_or_fd : file, gzipped file, pipe or opened file descriptor. 
+ """ + fd = open_or_fd(file_or_fd) + try: + binary = fd.read(2).decode() + if binary == '\0B' : + mat = _read_mat_binary_n(fd, start, end) + else: + assert(binary == ' [') + mat = _read_mat_ascii(fd, start, end) + finally: + if fd is not file_or_fd: fd.close() + return mat + +def _read_mat_binary_n(fd, start, end): + # Data type + header = fd.read(3).decode() + # 'CM', 'CM2', 'CM3' are possible values, + if header.startswith('CM'): return _read_compressed_mat_n(fd, header, start, end) + else: raise NotImplementedError("Binary data without compression is not supported:") + assert(sample_size > 0) + # Dimensions + s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] + # Read whole matrix + rows=n + buf = fd.read(rows * cols * sample_size) + if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') + elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') + else : raise BadSampleSize + mat = np.reshape(vec,(rows,cols)) + return mat + + +def _read_mat_ascii_n(fd, start, end): + raise NotImplementedError("Text data not supported.") + + +def _read_compressed_mat_n(fd, format, start, end): + """ Read a compressed matrix, + see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h + methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), + """ + assert(format == 'CM ') # The formats CM2, CM3 are not supported... + + # Format of header 'struct', + global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, + per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) + + # Read global header, + globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] + rows_to_read = end - start + + # The data is structed as [Colheader, ... , Colheader, Data, Data , .... 
] + # { cols }{ size } + col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) + col_headers = np.array([np.array([x for x in y]) * globrange * 1.52590218966964e-05 + globmin for y in col_headers], dtype=np.float32) + data = np.zeros((cols, rows_to_read), dtype='uint8') + header_offset = fd.tell() + for c in range(cols): + fd.seek((header_offset + rows*(c) +start) ) + data[c,:] = np.frombuffer(fd.read(rows_to_read), dtype='uint8', count=rows_to_read) + + # Seek to the next key + fd.seek((header_offset + rows*(c+1) ) ) + + mat = np.zeros((cols,rows_to_read), dtype='float32') + p0 = col_headers[:, 0].reshape(-1, 1) + p25 = col_headers[:, 1].reshape(-1, 1) + p75 = col_headers[:, 2].reshape(-1, 1) + p100 = col_headers[:, 3].reshape(-1, 1) + + mask_0_64 = (data <= 64) + mask_193_255 = (data > 192) + mask_65_192 = (~(mask_0_64 | mask_193_255)) + + mat += (p0 + (p25 - p0) / 64. * data) * mask_0_64.astype(np.float32) + mat += (p25 + (p75 - p25) / 128. * (data - 64)) * mask_65_192.astype(np.float32) + mat += (p75 + (p100 - p75) / 63. * (data - 192)) * mask_193_255.astype(np.float32) + + return mat.T # transpose! 
col-major -> row-major, + + +def get_durations(file_or_fd): + + global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, + durations = [] + fd = open_or_fd(file_or_fd) + try: + key = read_key(fd) + while key: + binary = fd.read(2).decode() + if binary == '\0B' : + header = fd.read(3).decode() + # 'FM', 'DM', 'CM', 'CM2', 'CM3' are possible values, + if (header == 'CM '): + # Read global header, + globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] + offset = fd.tell() + fd.seek( offset + cols*8 + rows*cols ) # 8 is the size of the column header + else: + raise NotImplementedError("Only compressed data in format CM is supported.") + else: + raise NotImplementedError("Text data not supported.") + durations.append( rows ) + key = read_key(fd) + finally: + if fd is not file_or_fd : fd.close() + return durations + +def get_durations_file_list( rxfile ): + + global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, + durations = [] + for rxf in rxfile: + try: + fd = open_or_fd(rxf) + binary = fd.read(2).decode() + if binary == '\0B' : + header = fd.read(3).decode() + # 'FM', 'DM', 'CM', 'CM2', 'CM3' are possible values, + if (header == 'CM '): + # Read global header, + globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] + offset = fd.tell() + fd.seek( offset + cols*8 + rows*cols ) # 8 is the size of the column header + else: + raise NotImplementedError("Only compressed data in format CM is supported.") + else: + raise NotImplementedError("Text data not supported.") + durations.append( rows ) + fd.close() + except: + print("An exception occurred when reading %s" %rxf) + sys.exit(-1) + return durations + + # Writing, def write_mat(file_or_fd, m, key=''): """ write_mat(f, m, key='') From 
b781084ed025ff30c7ea98d98a1a0e91ed281d93 Mon Sep 17 00:00:00 2001 From: Rohdin Johan A Date: Fri, 3 Apr 2020 16:02:39 +0200 Subject: [PATCH 2/5] Code for efficient reading of sliced features. Option to read data from an scp lines provided in a python list. --- README.md | 13 ++ kaldi_io/kaldi_io.py | 288 ++++++++++---------------------- tests/data/feats.scp | 6 + tests/data/feats_ascii.scp | 4 +- tests/data/feats_compressed.scp | 6 + tests/test_kaldi_io.py | 28 +++- 6 files changed, 141 insertions(+), 204 deletions(-) create mode 100644 tests/data/feats.scp create mode 100644 tests/data/feats_compressed.scp diff --git a/README.md b/README.md index 9da7084..40235e8 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,14 @@ kaldi-io-for-python - Matrix (float, double) - Posterior (posteriors, nnet1 training targets, confusion networks, ...) +#### Sclicing +scp files can contain entries like +"AMI_ES2011a_H00_FEE041_0003714_0003915_slice5 tests/data/feats.ark:14913[2:4,3:6]" +which in this case that row 2 and 3 and columns 3,4,5 are selected. For binary data +with/without compression, only the relevant elements will be read from disk. For ASCII +data, all data will be read and then sliced. Currently, only a step size of 1 is +supported. 
+ #### Examples ###### Reading feature scp example: @@ -36,6 +44,11 @@ with kaldi_io.open_or_fd(ark_scp_output,'wb') as f: kaldi_io.write_mat(f, mat, key=key) ``` +### Reading scp in form of a list +It is als possible to read data using an "scp" stored in a python list of the form +["AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats.ark:14913[:,7:13]", + "AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats.ark:14913[20:30,7:13]"] + #### Install - from pypi: `python -m pip --user install kaldi_io` - from sources: diff --git a/kaldi_io/kaldi_io.py b/kaldi_io/kaldi_io.py index f85110c..a53fe95 100755 --- a/kaldi_io/kaldi_io.py +++ b/kaldi_io/kaldi_io.py @@ -8,7 +8,7 @@ from __future__ import division import numpy as np -import sys, os, re, gzip, struct +import sys, os, re, gzip, struct, io ################################################# # Adding 'kaldi binaries' to shell path, @@ -354,19 +354,30 @@ def read_mat_scp(file_or_fd): Read scp to a 'dictionary': d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } + + The scp can also be in a list of the form + ["AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats.ark:14913[:,7:13]", + "AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats.ark:14913[20:30,7:13]"] + """ - fd = open_or_fd(file_or_fd) + if isinstance(file_or_fd, list): fd = file_or_fd + else: fd = open_or_fd(file_or_fd) + try: for line in fd: - (key, rxfile) = line.decode().split(' ') + + if isinstance(line, str): (key, rxfile) = line.split(' ') + else: (key, rxfile) = line.decode().split(' ') + (rxfile, range_slice) = _strip_mat_range(rxfile) - # TODO, this reads whole file, and then selects the range. - # A faster solution would be to change API of read_mat() and load just the frames we need... 
- mat = read_mat(rxfile) - if range_slice is not None: mat = (mat[range_slice]).copy() # apply the range_slice, - # - + if range_slice is not None: + if ( (range_slice[0].step != None) or (len(range_slice)==2 and (range_slice[1].step != None)) ): + raise NotImplementedError("Step other than 1 in slices is currently not supported.") + mat = read_mat(rxfile, range_slice) + else: + mat = read_mat(rxfile) + yield key, mat finally: if fd is not file_or_fd : fd.close() @@ -430,7 +441,8 @@ def _strip_mat_range(rxfile_with_range): return (rxfile, tuple(slice_arr)) -def read_mat(file_or_fd): + +def read_mat(file_or_fd, range_slice=None): """ [mat] = read_mat(file_or_fd) Reads single kaldi matrix, supports ascii and binary. file_or_fd : file, gzipped file, pipe or opened file descriptor. @@ -439,32 +451,28 @@ def read_mat(file_or_fd): try: binary = fd.read(2).decode() if binary == '\0B' : - mat = _read_mat_binary(fd) + mat = _read_mat_binary(fd, range_slice) else: - assert(binary == ' [') mat = _read_mat_ascii(fd) + if range_slice is not None: mat = (mat[range_slice]).copy() + finally: if fd is not file_or_fd: fd.close() return mat -def _read_mat_binary(fd): + +def _read_mat_binary(fd, range_slice=None): # Data type header = fd.read(3).decode() # 'CM', 'CM2', 'CM3' are possible values, - if header.startswith('CM'): return _read_compressed_mat(fd, header) - elif header == 'FM ': sample_size = 4 # floats - elif header == 'DM ': sample_size = 8 # doubles + if header.startswith('CM'): return _read_compressed_mat(fd, header, range_slice) + elif header == 'FM ': floatX ='float32' # floats + elif header == 'DM ': floatX = 'float64' # doubles else: raise UnknownMatrixHeader("The header contained '%s'" % header) - assert(sample_size > 0) + # Dimensions s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] - # Read whole matrix - buf = fd.read(rows * cols * sample_size) - if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') - elif 
sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - mat = np.reshape(vec,(rows,cols)) - return mat + return _read_range_slice(fd, rows, cols, floatX, range_slice=range_slice) def _read_mat_ascii(fd): @@ -482,13 +490,13 @@ def _read_mat_ascii(fd): return mat -def _read_compressed_mat(fd, format): +def _read_compressed_mat(fd, format, range_slice): """ Read a compressed matrix, see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), """ assert(format == 'CM ') # The formats CM2, CM3 are not supported... - + # Format of header 'struct', global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) @@ -496,133 +504,20 @@ def _read_compressed_mat(fd, format): # Read global header, globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] - # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ] - # { cols }{ size } - col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) - col_headers = np.array([np.array([x for x in y]) * globrange * 1.52590218966964e-05 + globmin for y in col_headers], dtype=np.float32) - data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, - - mat = np.zeros((cols,rows), dtype='float32') - p0 = col_headers[:, 0].reshape(-1, 1) - p25 = col_headers[:, 1].reshape(-1, 1) - p75 = col_headers[:, 2].reshape(-1, 1) - p100 = col_headers[:, 3].reshape(-1, 1) - mask_0_64 = (data <= 64) - mask_193_255 = (data > 192) - mask_65_192 = (~(mask_0_64 | mask_193_255)) - - mat += (p0 + (p25 - p0) / 64. 
* data) * mask_0_64.astype(np.float32) - mat += (p25 + (p75 - p25) / 128. * (data - 64)) * mask_65_192.astype(np.float32) - mat += (p75 + (p100 - p75) / 63. * (data - 192)) * mask_193_255.astype(np.float32) - - return mat.T # transpose! col-major -> row-major, - - -### -def read_file_segm(rxfile, start, end): - - assert( len(rxfile)==len(start)==len(end) ) - - data = [] - try: - for i,rxf in enumerate( rxfile ): - data.append( read_mat_n(rxf, start[i], end[i]) ) - except: - print("An exception occurred") - return data - -def read_mat_ark_n(file_or_fd, start, end): - """ generator(key,mat) = read_mat_ark(file_or_fd) - Returns generator of (key,matrix) tuples, read from ark file/stream. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the ark: - for key,mat in kaldi_io.read_mat_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - mat = read_mat_n(fd, start, end) - yield key, mat - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - - -def read_mat_n(file_or_fd, start, end): - """ [mat] = read_mat(file_or_fd) - Reads single kaldi matrix, supports ascii and binary. - file_or_fd : file, gzipped file, pipe or opened file descriptor. 
- """ - fd = open_or_fd(file_or_fd) - try: - binary = fd.read(2).decode() - if binary == '\0B' : - mat = _read_mat_binary_n(fd, start, end) - else: - assert(binary == ' [') - mat = _read_mat_ascii(fd, start, end) - finally: - if fd is not file_or_fd: fd.close() - return mat - -def _read_mat_binary_n(fd, start, end): - # Data type - header = fd.read(3).decode() - # 'CM', 'CM2', 'CM3' are possible values, - if header.startswith('CM'): return _read_compressed_mat_n(fd, header, start, end) - else: raise NotImplementedError("Binary data without compression is not supported:") - assert(sample_size > 0) - # Dimensions - s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] - # Read whole matrix - rows=n - buf = fd.read(rows * cols * sample_size) - if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - mat = np.reshape(vec,(rows,cols)) - return mat - - -def _read_mat_ascii_n(fd, start, end): - raise NotImplementedError("Text data not supported.") - - -def _read_compressed_mat_n(fd, format, start, end): - """ Read a compressed matrix, - see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h - methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), - """ - assert(format == 'CM ') # The formats CM2, CM3 are not supported... 
- - # Format of header 'struct', - global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, - per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) - - # Read global header, - globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] - rows_to_read = end - start + # Standardize range_slice + if range_slice is None: range_slice = (slice(None,None,None),slice(None,None,None)) + elif len(range_slice) ==1 : range_slice = (range_slice[0],slice(None,None,None)) # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ] # { cols }{ size } - col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) + col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols)[range_slice[1]] col_headers = np.array([np.array([x for x in y]) * globrange * 1.52590218966964e-05 + globmin for y in col_headers], dtype=np.float32) - data = np.zeros((cols, rows_to_read), dtype='uint8') - header_offset = fd.tell() - for c in range(cols): - fd.seek((header_offset + rows*(c) +start) ) - data[c,:] = np.frombuffer(fd.read(rows_to_read), dtype='uint8', count=rows_to_read) - # Seek to the next key - fd.seek((header_offset + rows*(c+1) ) ) - - mat = np.zeros((cols,rows_to_read), dtype='float32') + # Note that, contrary to standard matrices, the compressed matrices are column-major + # so we need to flip rows and colums when using the below function for reading. + data = _read_range_slice(fd, cols, rows, 'uint8', range_slice=(range_slice[1],range_slice[0])) + + mat = np.zeros_like(data, dtype='float32') p0 = col_headers[:, 0].reshape(-1, 1) p25 = col_headers[:, 1].reshape(-1, 1) p75 = col_headers[:, 2].reshape(-1, 1) @@ -639,59 +534,54 @@ def _read_compressed_mat_n(fd, format, start, end): return mat.T # transpose! 
col-major -> row-major, -def get_durations(file_or_fd): - - global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, - durations = [] - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - binary = fd.read(2).decode() - if binary == '\0B' : - header = fd.read(3).decode() - # 'FM', 'DM', 'CM', 'CM2', 'CM3' are possible values, - if (header == 'CM '): - # Read global header, - globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] - offset = fd.tell() - fd.seek( offset + cols*8 + rows*cols ) # 8 is the size of the column header - else: - raise NotImplementedError("Only compressed data in format CM is supported.") - else: - raise NotImplementedError("Text data not supported.") - durations.append( rows ) - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - return durations +def _read_range_slice(fd, rows, cols, dtype, range_slice=None): -def get_durations_file_list( rxfile ): + if (dtype == 'float32'): sample_size=4 + elif (dtype == 'float64'): sample_size=8 + elif (dtype == 'uint8'): sample_size=1 + else: raise UnsupportedDataType("Data type was %s" % str(dtype)) + + # Find the start and end indices etc. + if range_slice is None: range_slice = (slice(None,None,None),slice(None,None,None)) + # + start_row = 0 if range_slice[0].start is None else range_slice[0].start + end_row = rows if range_slice[0].stop is None else range_slice[0].stop + rows_to_read = end_row - start_row + # + if (len(range_slice)==2): + start_col = 0 if range_slice[1].start is None else range_slice[1].start + end_col = cols if range_slice[1].stop is None else range_slice[1].stop + else: + start_col = 0 + end_col = cols + + # We want to read the data using as few seek as possible. 
So the procedure will + # be different depending on the properties of range_slice + + if (start_col == 0 and end_col == cols): + # In this case we can read consequtively + if fd.seekable(): + header_offset = fd.tell() + fd.seek(header_offset + start_row*cols*sample_size ) + else: + # In this case the input is pipe and there should be no offset + assert (start_row ==0), ("Start row is %s but should be 0 for non-seekable data" %str(start_row)) + + buf = fd.read(rows_to_read * cols * sample_size) + vec = np.frombuffer(buf, dtype=dtype) + mat = np.reshape(vec,(rows_to_read,cols)) + else: + # In this case we need to read at different places + assert fd.seekable(), ("fd %str(fd)is not seekable" % str(fd) ) # Again, this should not happend for pipes + header_offset = fd.tell() + cols_to_read = end_col - start_col + mat = np.zeros((rows_to_read, cols_to_read), dtype=dtype) + for r in range(start_row, end_row): + fd.seek(header_offset + (r*cols + start_col)*sample_size ) + d = fd.read(cols_to_read*sample_size) + mat[r-start_row,:] = (np.frombuffer(d, dtype=dtype, count=cols_to_read)) - global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, - durations = [] - for rxf in rxfile: - try: - fd = open_or_fd(rxf) - binary = fd.read(2).decode() - if binary == '\0B' : - header = fd.read(3).decode() - # 'FM', 'DM', 'CM', 'CM2', 'CM3' are possible values, - if (header == 'CM '): - # Read global header, - globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] - offset = fd.tell() - fd.seek( offset + cols*8 + rows*cols ) # 8 is the size of the column header - else: - raise NotImplementedError("Only compressed data in format CM is supported.") - else: - raise NotImplementedError("Text data not supported.") - durations.append( rows ) - fd.close() - except: - print("An exception occurred when reading %s" %rxf) - sys.exit(-1) - return durations + return mat # 
Writing, diff --git a/tests/data/feats.scp b/tests/data/feats.scp new file mode 100644 index 0000000..8b2ec58 --- /dev/null +++ b/tests/data/feats.scp @@ -0,0 +1,6 @@ +AMI_ES2011a_H00_FEE041_0003427_0003714 tests/data/feats.ark:39 +AMI_ES2011a_H00_FEE041_0003714_0003915 tests/data/feats.ark:14913 +AMI_ES2011a_H00_FEE041_0003714_0003915_slice1 tests/data/feats.ark:14913[:] +AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats.ark:14913[0:5] +AMI_ES2011a_H00_FEE041_0003714_0003915_slice3 tests/data/feats.ark:14913[:,7:13] +AMI_ES2011a_H00_FEE041_0003714_0003915_slice4 tests/data/feats.ark:14913[20:30,7:13] diff --git a/tests/data/feats_ascii.scp b/tests/data/feats_ascii.scp index 2f88d8e..c338a9f 100644 --- a/tests/data/feats_ascii.scp +++ b/tests/data/feats_ascii.scp @@ -2,5 +2,5 @@ AMI_ES2011a_H00_FEE041_0003427_0003714 tests/data/feats_ascii.ark:39 AMI_ES2011a_H00_FEE041_0003714_0003915 tests/data/feats_ascii.ark:35877 AMI_ES2011a_H00_FEE041_0003714_0003915_slice1 tests/data/feats_ascii.ark:35877[:] AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats_ascii.ark:35877[0:5] -AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats_ascii.ark:35877[:,7:13] -AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats_ascii.ark:35877[20:30,7:13] +AMI_ES2011a_H00_FEE041_0003714_0003915_slice3 tests/data/feats_ascii.ark:35877[:,7:13] +AMI_ES2011a_H00_FEE041_0003714_0003915_slice4 tests/data/feats_ascii.ark:35877[20:30,7:13] diff --git a/tests/data/feats_compressed.scp b/tests/data/feats_compressed.scp new file mode 100644 index 0000000..eb9d115 --- /dev/null +++ b/tests/data/feats_compressed.scp @@ -0,0 +1,6 @@ +AMI_ES2011a_H00_FEE041_0003427_0003714 tests/data/feats_compressed.ark:39 +AMI_ES2011a_H00_FEE041_0003714_0003915 tests/data/feats_compressed.ark:3908 +AMI_ES2011a_H00_FEE041_0003714_0003915_slice1 tests/data/feats_compressed.ark:3908[:] +AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats_compressed.ark:3908[0:5] 
+AMI_ES2011a_H00_FEE041_0003714_0003915_slice3 tests/data/feats_compressed.ark:3908[:,7:13] +AMI_ES2011a_H00_FEE041_0003714_0003915_slice4 tests/data/feats_compressed.ark:3908[20:30,7:13] \ No newline at end of file diff --git a/tests/test_kaldi_io.py b/tests/test_kaldi_io.py index e02adda..627df16 100644 --- a/tests/test_kaldi_io.py +++ b/tests/test_kaldi_io.py @@ -47,13 +47,17 @@ def testMatrixReadWrite(self): # read, flt_mat = { k:m for k,m in kaldi_io.read_mat_scp('tests/data/feats_ascii.scp') } # ascii-scp, flt_mat2 = { k:m for k,m in kaldi_io.read_mat_ark('tests/data/feats_ascii.ark') } # ascii-ark, - flt_mat3 = { k:m for k,m in kaldi_io.read_mat_ark('tests/data/feats.ark') } # ascii-ark, + flt_mat3 = { k:m for k,m in kaldi_io.read_mat_scp('tests/data/feats.scp') } # scp for binary, + flt_mat4 = { k:m for k,m in kaldi_io.read_mat_ark('tests/data/feats.ark') } # binary-ark, + flt_mat5 = { k:m for k,m in kaldi_io.read_mat_scp('tests/data/feats_compressed.scp') } # scp for compressed binary, + flt_mat6 = { k:m for k,m in kaldi_io.read_mat_ark('tests/data/feats_compressed.ark') } # compressed binary-ark, + # store, with kaldi_io.open_or_fd('tests/data_re-saved/mat.ark','wb') as f: - for k,m in flt_mat3.items(): kaldi_io.write_mat(f, m, k) + for k,m in flt_mat6.items(): kaldi_io.write_mat(f, m, k) # read and compare, for k,m in kaldi_io.read_mat_ark('tests/data_re-saved/mat.ark'): - self.assertTrue(np.array_equal(m, flt_mat3[k]), msg="flt. matrix same after re-saving") + self.assertTrue(np.array_equal(m, flt_mat6[k]), msg="flt. 
matrix same after re-saving") def testPipeReadWrite(self): """ @@ -77,6 +81,24 @@ def testPipeReadWrite(self): flt_vec4 = { k:v for k,v in kaldi_io.read_vec_flt_ark('ark:copy-vector ark:tests/data/conf.ark ark:- |') } + def testListScp(self): + print ("list scp") + scp = ["AMI_ES2011a_H00_FEE041_0003427_0003714 tests/data/feats.ark:39", + "AMI_ES2011a_H00_FEE041_0003714_0003915 tests/data/feats.ark:14913", + "AMI_ES2011a_H00_FEE041_0003714_0003915_slice1 tests/data/feats.ark:14913[:]", + "AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats.ark:14913[0:5]", + "AMI_ES2011a_H00_FEE041_0003714_0003915_slice3 tests/data/feats.ark:14913[:,7:13]", + "AMI_ES2011a_H00_FEE041_0003714_0003915_slice4 tests/data/feats.ark:14913[20:30,7:13]"] + + flt_mat = { k:m for k,m in kaldi_io.read_mat_scp(scp) } # ascii-scp, + + # store, + with kaldi_io.open_or_fd('tests/data_re-saved/mat_listScp.ark','wb') as f: + for k,m in flt_mat.items(): kaldi_io.write_mat(f, m, k) + # read and compare, + for k,m in kaldi_io.read_mat_ark('tests/data_re-saved/mat_listScp.ark'): + self.assertTrue(np.array_equal(m, flt_mat[k]), msg="flt. matrix same after re-saving") + class PosteriorIOTest(unittest.TestCase): def testWriteReadPosteriors(self): data = [[(0, 0.0), (1, 0.1), (2, 0.2)], From c9fb6c9b1a387495f3faf7de624493fa27d52de8 Mon Sep 17 00:00:00 2001 From: Rohdin Johan A Date: Fri, 3 Apr 2020 17:34:33 +0200 Subject: [PATCH 3/5] Added comments about slicing concerns. --- kaldi_io/kaldi_io.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/kaldi_io/kaldi_io.py b/kaldi_io/kaldi_io.py index a53fe95..aab12a1 100755 --- a/kaldi_io/kaldi_io.py +++ b/kaldi_io/kaldi_io.py @@ -556,14 +556,13 @@ def _read_range_slice(fd, rows, cols, dtype, range_slice=None): end_col = cols # We want to read the data using as few seek as possible. 
So the procedure will - # be different depending on the properties of range_slice - + # be different depending on the properties of range_slice if (start_col == 0 and end_col == cols): # In this case we can read consequtively - if fd.seekable(): - header_offset = fd.tell() - fd.seek(header_offset + start_row*cols*sample_size ) - else: + if fd.seekable(): # Comment 1: We only only read slices on seekable input. This should be + header_offset = fd.tell() # every case except piped input, right? And there is no way that piped + fd.seek(header_offset + start_row*cols*sample_size ) # could come with slice information since slice info is provided in scp. + else: # In this case the input is pipe and there should be no offset assert (start_row ==0), ("Start row is %s but should be 0 for non-seekable data" %str(start_row)) @@ -577,10 +576,18 @@ def _read_range_slice(fd, rows, cols, dtype, range_slice=None): cols_to_read = end_col - start_col mat = np.zeros((rows_to_read, cols_to_read), dtype=dtype) for r in range(start_row, end_row): - fd.seek(header_offset + (r*cols + start_col)*sample_size ) + fd.seek( header_offset + (r*cols + start_col)*sample_size ) d = fd.read(cols_to_read*sample_size) mat[r-start_row,:] = (np.frombuffer(d, dtype=dtype, count=cols_to_read)) + # Comment 2: Currently it is not supported to provide slice info via "read_mat_ark" + # If we want to extend it so it takes slice info as input i.e. + # read_mat_ark(file_or_fd, list_of_row_slices, list_of_col_slices) where "list_of_row_slices" + # contains one slice per key in the ark file we have to add something like this here: + # # Seek to the next key + # fd.seek( header_offset + (end_row*cols + start_col)*sample_size ) + # to make sure that we are at the start of the next key after data reading is done. 
+ return mat From e5251e2677afd70225eae8d07b685788ba24d502 Mon Sep 17 00:00:00 2001 From: Rohdin Johan A Date: Fri, 3 Apr 2020 18:19:46 +0200 Subject: [PATCH 4/5] Fixed README formating --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 40235e8..9d77f53 100644 --- a/README.md +++ b/README.md @@ -9,9 +9,9 @@ kaldi-io-for-python - Matrix (float, double) - Posterior (posteriors, nnet1 training targets, confusion networks, ...) -#### Sclicing +#### Slicing scp files can contain entries like -"AMI_ES2011a_H00_FEE041_0003714_0003915_slice5 tests/data/feats.ark:14913[2:4,3:6]" +``` AMI_ES2011a_H00_FEE041_0003714_0003915_slice5 tests/data/feats.ark:14913[2:4,3:6]``` which in this case that row 2 and 3 and columns 3,4,5 are selected. For binary data with/without compression, only the relevant elements will be read from disk. For ASCII data, all data will be read and then sliced. Currently, only a step size of 1 is @@ -46,8 +46,10 @@ with kaldi_io.open_or_fd(ark_scp_output,'wb') as f: ### Reading scp in form of a list It is als possible to read data using an "scp" stored in a python list of the form +``` ["AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats.ark:14913[:,7:13]", "AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats.ark:14913[20:30,7:13]"] +``` #### Install - from pypi: `python -m pip --user install kaldi_io` From 43200380a9a12a3ccb327fe07f0607a3a518cb58 Mon Sep 17 00:00:00 2001 From: Rohdin Johan A Date: Fri, 3 Apr 2020 18:21:42 +0200 Subject: [PATCH 5/5] Fixed README formating again --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9d77f53..517cd09 100644 --- a/README.md +++ b/README.md @@ -44,8 +44,8 @@ with kaldi_io.open_or_fd(ark_scp_output,'wb') as f: kaldi_io.write_mat(f, mat, key=key) ``` -### Reading scp in form of a list -It is als possible to read data using an "scp" stored in a python list of the form
+###### Reading scp in the form of a list +It is also possible to read data using an "scp" stored in a Python list of the form ``` ["AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats.ark:14913[:,7:13]", "AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats.ark:14913[20:30,7:13]"]