diff --git a/README.md b/README.md index 9da7084..517cd09 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,14 @@ kaldi-io-for-python - Matrix (float, double) - Posterior (posteriors, nnet1 training targets, confusion networks, ...) +#### Slicing +scp files can contain entries like +``` AMI_ES2011a_H00_FEE041_0003714_0003915_slice5 tests/data/feats.ark:14913[2:4,3:6]``` +which in this case means that rows 2 and 3 and columns 3, 4 and 5 are selected. For binary data +with/without compression, only the relevant elements will be read from disk. For ASCII +data, all data will be read and then sliced. Currently, only a step size of 1 is +supported. + #### Examples ###### Reading feature scp example: @@ -36,6 +44,13 @@ with kaldi_io.open_or_fd(ark_scp_output,'wb') as f: kaldi_io.write_mat(f, mat, key=key) ``` +###### Reading scp in form of a list +It is also possible to read data using an "scp" stored in a Python list of the form +``` +["AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats.ark:14913[:,7:13]", + "AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats.ark:14913[20:30,7:13]"] +``` + #### Install - from pypi: `python -m pip --user install kaldi_io` - from sources: diff --git a/kaldi_io/kaldi_io.py b/kaldi_io/kaldi_io.py index aaabb7b..8db5ed5 100755 --- a/kaldi_io/kaldi_io.py +++ b/kaldi_io/kaldi_io.py @@ -8,7 +8,7 @@ from __future__ import division import numpy as np -import sys, os, re, gzip, struct +import sys, os, re, gzip, struct, io ################################################# # Adding 'kaldi binaries' to shell path, @@ -354,19 +354,30 @@ def read_mat_scp(file_or_fd): Read scp to a 'dictionary': d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } + + The scp can also be in a list of the form + ["AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats.ark:14913[:,7:13]", + "AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats.ark:14913[20:30,7:13]"] + """ - fd = open_or_fd(file_or_fd) + if isinstance(file_or_fd, list): 
fd = file_or_fd + else: fd = open_or_fd(file_or_fd) + try: for line in fd: - (key, rxfile) = line.decode().split(' ') + + if isinstance(line, str): (key, rxfile) = line.split(' ') + else: (key, rxfile) = line.decode().split(' ') + (rxfile, range_slice) = _strip_mat_range(rxfile) - # TODO, this reads whole file, and then selects the range. - # A faster solution would be to change API of read_mat() and load just the frames we need... - mat = read_mat(rxfile) - if range_slice is not None: mat = (mat[range_slice]).copy() # apply the range_slice, - # - + if range_slice is not None: + if ( (range_slice[0].step != None) or (len(range_slice)==2 and (range_slice[1].step != None)) ): + raise NotImplementedError("Step other than 1 in slices is currently not supported.") + mat = read_mat(rxfile, range_slice) + else: + mat = read_mat(rxfile) + yield key, mat finally: if fd is not file_or_fd : fd.close() @@ -430,7 +441,8 @@ def _strip_mat_range(rxfile_with_range): return (rxfile, tuple(slice_arr)) -def read_mat(file_or_fd): + +def read_mat(file_or_fd, range_slice=None): """ [mat] = read_mat(file_or_fd) Reads single kaldi matrix, supports ascii and binary. file_or_fd : file, gzipped file, pipe or opened file descriptor. 
@@ -439,32 +451,29 @@ def read_mat(file_or_fd): try: binary = fd.read(2).decode() if binary == '\0B' : - mat = _read_mat_binary(fd) + mat = _read_mat_binary(fd, range_slice) else: - assert(binary == ' [') mat = _read_mat_ascii(fd) + if range_slice is not None: mat = (mat[range_slice]).copy() + finally: if fd is not file_or_fd: fd.close() return mat -def _read_mat_binary(fd): + +def _read_mat_binary(fd, range_slice=None): # Data type header = fd.read(3).decode() # 'CM', 'CM2', 'CM3' are possible values, - if header.startswith('CM'): return _read_compressed_mat(fd, header) - elif header == 'FM ': sample_size = 4 # floats - elif header == 'DM ': sample_size = 8 # doubles + if header.startswith('CM'): return _read_compressed_mat(fd, header, range_slice) + elif header == 'FM ': floatX ='float32' # floats + elif header == 'DM ': floatX = 'float64' # doubles else: raise UnknownMatrixHeader("The header contained '%s'" % header) - assert(sample_size > 0) + # Dimensions s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] - # Read whole matrix - buf = fd.read(rows * cols * sample_size) - if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - mat = np.reshape(vec,(rows,cols)) - return mat + return _read_range_slice(fd, rows, cols, floatX, range_slice=range_slice) + def _read_mat_ascii(fd): rows = [] @@ -481,13 +490,13 @@ def _read_mat_ascii(fd): return mat -def _read_compressed_mat(fd, format): +def _read_compressed_mat(fd, format, range_slice): """ Read a compressed matrix, see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), """ assert(format == 'CM ') # The formats CM2, CM3 are not supported... 
- + # Format of header 'struct', global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) @@ -495,17 +504,25 @@ def _read_compressed_mat(fd, format): # Read global header, globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] + # Standardize range_slice + if range_slice is None: range_slice = (slice(None,None,None),slice(None,None,None)) + elif len(range_slice) ==1 : range_slice = (range_slice[0],slice(None,None,None)) + # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ] # { cols }{ size } - col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) + col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols)[range_slice[1]] col_headers = np.array([np.array([x for x in y]) * globrange * 1.52590218966964e-05 + globmin for y in col_headers], dtype=np.float32) - data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, - mat = np.zeros((cols,rows), dtype='float32') + # Note that, contrary to standard matrices, the compressed matrices are column-major + # so we need to flip rows and colums when using the below function for reading. + data = _read_range_slice(fd, cols, rows, 'uint8', range_slice=(range_slice[1],range_slice[0])) + + mat = np.zeros_like(data, dtype='float32') p0 = col_headers[:, 0].reshape(-1, 1) p25 = col_headers[:, 1].reshape(-1, 1) p75 = col_headers[:, 2].reshape(-1, 1) p100 = col_headers[:, 3].reshape(-1, 1) + mask_0_64 = (data <= 64) mask_193_255 = (data > 192) mask_65_192 = (~(mask_0_64 | mask_193_255)) @@ -517,6 +534,63 @@ def _read_compressed_mat(fd, format): return mat.T # transpose! 
col-major -> row-major, +def _read_range_slice(fd, rows, cols, dtype, range_slice=None): + + if (dtype == 'float32'): sample_size=4 + elif (dtype == 'float64'): sample_size=8 + elif (dtype == 'uint8'): sample_size=1 + else: raise UnsupportedDataType("Data type was %s" % str(dtype)) + + # Find the start and end indices etc. + if range_slice is None: range_slice = (slice(None,None,None),slice(None,None,None)) + # + start_row = 0 if range_slice[0].start is None else range_slice[0].start + end_row = rows if range_slice[0].stop is None else range_slice[0].stop + rows_to_read = end_row - start_row + # + if (len(range_slice)==2): + start_col = 0 if range_slice[1].start is None else range_slice[1].start + end_col = cols if range_slice[1].stop is None else range_slice[1].stop + else: + start_col = 0 + end_col = cols + + # We want to read the data using as few seeks as possible. So the procedure will + # be different depending on the properties of range_slice + if (start_col == 0 and end_col == cols): + # In this case we can read consecutively + if fd.seekable(): # Comment 1: We only read slices on seekable input. This should be + header_offset = fd.tell() # every case except piped input, right? And there is no way that piped + fd.seek(header_offset + start_row*cols*sample_size ) # input could come with slice information since slice info is provided in scp. 
+ else: + # In this case the input is a pipe and there should be no offset + assert (start_row ==0), ("Start row is %s but should be 0 for non-seekable data" %str(start_row)) + + buf = fd.read(rows_to_read * cols * sample_size) + vec = np.frombuffer(buf, dtype=dtype) + mat = np.reshape(vec,(rows_to_read,cols)) + else: + # In this case we need to read at different places + assert fd.seekable(), ("fd %str(fd)is not seekable" % str(fd) ) # Again, this should not happen for pipes + header_offset = fd.tell() + cols_to_read = end_col - start_col + mat = np.zeros((rows_to_read, cols_to_read), dtype=dtype) + for r in range(start_row, end_row): + fd.seek( header_offset + (r*cols + start_col)*sample_size ) + d = fd.read(cols_to_read*sample_size) + mat[r-start_row,:] = (np.frombuffer(d, dtype=dtype, count=cols_to_read)) + + # Comment 2: Currently it is not supported to provide slice info via "read_mat_ark" + # If we want to extend it so it takes slice info as input i.e. + # read_mat_ark(file_or_fd, list_of_row_slices, list_of_col_slices) where "list_of_row_slices" + # contains one slice per key in the ark file we have to add something like this here: + # # Seek to the next key + # fd.seek( header_offset + (end_row*cols + start_col)*sample_size ) + # to make sure that we are at the start of the next key after data reading is done. 
+ + return mat + + # Writing, def write_mat(file_or_fd, m, key=''): """ write_mat(f, m, key='') diff --git a/tests/data/feats.scp b/tests/data/feats.scp new file mode 100644 index 0000000..8b2ec58 --- /dev/null +++ b/tests/data/feats.scp @@ -0,0 +1,6 @@ +AMI_ES2011a_H00_FEE041_0003427_0003714 tests/data/feats.ark:39 +AMI_ES2011a_H00_FEE041_0003714_0003915 tests/data/feats.ark:14913 +AMI_ES2011a_H00_FEE041_0003714_0003915_slice1 tests/data/feats.ark:14913[:] +AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats.ark:14913[0:5] +AMI_ES2011a_H00_FEE041_0003714_0003915_slice3 tests/data/feats.ark:14913[:,7:13] +AMI_ES2011a_H00_FEE041_0003714_0003915_slice4 tests/data/feats.ark:14913[20:30,7:13] diff --git a/tests/data/feats_ascii.scp b/tests/data/feats_ascii.scp index 2f88d8e..c338a9f 100644 --- a/tests/data/feats_ascii.scp +++ b/tests/data/feats_ascii.scp @@ -2,5 +2,5 @@ AMI_ES2011a_H00_FEE041_0003427_0003714 tests/data/feats_ascii.ark:39 AMI_ES2011a_H00_FEE041_0003714_0003915 tests/data/feats_ascii.ark:35877 AMI_ES2011a_H00_FEE041_0003714_0003915_slice1 tests/data/feats_ascii.ark:35877[:] AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats_ascii.ark:35877[0:5] -AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats_ascii.ark:35877[:,7:13] -AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats_ascii.ark:35877[20:30,7:13] +AMI_ES2011a_H00_FEE041_0003714_0003915_slice3 tests/data/feats_ascii.ark:35877[:,7:13] +AMI_ES2011a_H00_FEE041_0003714_0003915_slice4 tests/data/feats_ascii.ark:35877[20:30,7:13] diff --git a/tests/data/feats_compressed.scp b/tests/data/feats_compressed.scp new file mode 100644 index 0000000..eb9d115 --- /dev/null +++ b/tests/data/feats_compressed.scp @@ -0,0 +1,6 @@ +AMI_ES2011a_H00_FEE041_0003427_0003714 tests/data/feats_compressed.ark:39 +AMI_ES2011a_H00_FEE041_0003714_0003915 tests/data/feats_compressed.ark:3908 +AMI_ES2011a_H00_FEE041_0003714_0003915_slice1 tests/data/feats_compressed.ark:3908[:] 
+AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats_compressed.ark:3908[0:5] +AMI_ES2011a_H00_FEE041_0003714_0003915_slice3 tests/data/feats_compressed.ark:3908[:,7:13] +AMI_ES2011a_H00_FEE041_0003714_0003915_slice4 tests/data/feats_compressed.ark:3908[20:30,7:13] \ No newline at end of file diff --git a/tests/test_kaldi_io.py b/tests/test_kaldi_io.py index e02adda..627df16 100644 --- a/tests/test_kaldi_io.py +++ b/tests/test_kaldi_io.py @@ -47,13 +47,17 @@ def testMatrixReadWrite(self): # read, flt_mat = { k:m for k,m in kaldi_io.read_mat_scp('tests/data/feats_ascii.scp') } # ascii-scp, flt_mat2 = { k:m for k,m in kaldi_io.read_mat_ark('tests/data/feats_ascii.ark') } # ascii-ark, - flt_mat3 = { k:m for k,m in kaldi_io.read_mat_ark('tests/data/feats.ark') } # ascii-ark, + flt_mat3 = { k:m for k,m in kaldi_io.read_mat_scp('tests/data/feats.scp') } # scp for binary, + flt_mat4 = { k:m for k,m in kaldi_io.read_mat_ark('tests/data/feats.ark') } # binary-ark, + flt_mat5 = { k:m for k,m in kaldi_io.read_mat_scp('tests/data/feats_compressed.scp') } # scp for compressed binary, + flt_mat6 = { k:m for k,m in kaldi_io.read_mat_ark('tests/data/feats_compressed.ark') } # compressed binary-ark, + # store, with kaldi_io.open_or_fd('tests/data_re-saved/mat.ark','wb') as f: - for k,m in flt_mat3.items(): kaldi_io.write_mat(f, m, k) + for k,m in flt_mat6.items(): kaldi_io.write_mat(f, m, k) # read and compare, for k,m in kaldi_io.read_mat_ark('tests/data_re-saved/mat.ark'): - self.assertTrue(np.array_equal(m, flt_mat3[k]), msg="flt. matrix same after re-saving") + self.assertTrue(np.array_equal(m, flt_mat6[k]), msg="flt. 
matrix same after re-saving") def testPipeReadWrite(self): """ @@ -77,6 +81,24 @@ def testPipeReadWrite(self): flt_vec4 = { k:v for k,v in kaldi_io.read_vec_flt_ark('ark:copy-vector ark:tests/data/conf.ark ark:- |') } + def testListScp(self): + print ("list scp") + scp = ["AMI_ES2011a_H00_FEE041_0003427_0003714 tests/data/feats.ark:39", + "AMI_ES2011a_H00_FEE041_0003714_0003915 tests/data/feats.ark:14913", + "AMI_ES2011a_H00_FEE041_0003714_0003915_slice1 tests/data/feats.ark:14913[:]", + "AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats.ark:14913[0:5]", + "AMI_ES2011a_H00_FEE041_0003714_0003915_slice3 tests/data/feats.ark:14913[:,7:13]", + "AMI_ES2011a_H00_FEE041_0003714_0003915_slice4 tests/data/feats.ark:14913[20:30,7:13]"] + + flt_mat = { k:m for k,m in kaldi_io.read_mat_scp(scp) } # ascii-scp, + + # store, + with kaldi_io.open_or_fd('tests/data_re-saved/mat_listScp.ark','wb') as f: + for k,m in flt_mat.items(): kaldi_io.write_mat(f, m, k) + # read and compare, + for k,m in kaldi_io.read_mat_ark('tests/data_re-saved/mat_listScp.ark'): + self.assertTrue(np.array_equal(m, flt_mat[k]), msg="flt. matrix same after re-saving") + class PosteriorIOTest(unittest.TestCase): def testWriteReadPosteriors(self): data = [[(0, 0.0), (1, 0.1), (2, 0.2)],