From 9b1e218de2fc063292cc3719cc2c1f084cb7ebec Mon Sep 17 00:00:00 2001
From: Rohdin Johan A <rohdin@fit.vutbr.cz>
Date: Mon, 24 Feb 2020 15:09:11 +0100
Subject: [PATCH 1/5] Added functions for reading feature chunks.

---
 kaldi_io/kaldi_io.py | 177 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 177 insertions(+)

diff --git a/kaldi_io/kaldi_io.py b/kaldi_io/kaldi_io.py
index 498f9a8..eee566e 100755
--- a/kaldi_io/kaldi_io.py
+++ b/kaldi_io/kaldi_io.py
@@ -409,6 +409,7 @@ def _read_mat_binary(fd):
     mat = np.reshape(vec,(rows,cols))
     return mat
 
+
 def _read_mat_ascii(fd):
     rows = []
     while 1:
@@ -460,6 +461,182 @@ def _read_compressed_mat(fd, format):
     return mat.T # transpose! col-major -> row-major,
 
 
+###
+def read_file_segm(rxfile, start, end):
+
+    assert( len(rxfile)==len(start)==len(end) )
+
+    data = []
+    try:
+        for i,rxf in enumerate( rxfile ):
+            data.append( read_mat_n(rxf, start[i], end[i]) )
+    except:
+          print("An exception occurred")
+    return data 
+                
+def read_mat_ark_n(file_or_fd, start, end):
+    """ generator(key,mat) = read_mat_ark(file_or_fd)
+     Returns generator of (key,matrix) tuples, read from ark file/stream.
+     file_or_fd : scp, gzipped scp, pipe or opened file descriptor.
+
+     Iterate the ark:
+     for key,mat in kaldi_io.read_mat_ark(file):
+         ...
+
+     Read ark to a 'dictionary':
+     d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) }
+    """
+    fd = open_or_fd(file_or_fd)
+    try:
+        key = read_key(fd)
+        while key:
+            mat = read_mat_n(fd, start, end)
+            yield key, mat
+            key = read_key(fd)
+    finally:
+        if fd is not file_or_fd : fd.close()
+
+
+def read_mat_n(file_or_fd, start, end):
+    """ [mat] = read_mat(file_or_fd)
+     Reads single kaldi matrix, supports ascii and binary.
+     file_or_fd : file, gzipped file, pipe or opened file descriptor.
+    """
+    fd = open_or_fd(file_or_fd)
+    try:
+        binary = fd.read(2).decode()
+        if binary == '\0B' :
+            mat = _read_mat_binary_n(fd, start, end)
+        else:
+            assert(binary == ' [')
+            mat = _read_mat_ascii(fd, start, end)
+    finally:
+        if fd is not file_or_fd: fd.close()
+    return mat
+
+def _read_mat_binary_n(fd, start, end):
+    # Data type
+    header = fd.read(3).decode()
+    # 'CM', 'CM2', 'CM3' are possible values,
+    if header.startswith('CM'): return _read_compressed_mat_n(fd, header, start, end)
+    else: raise NotImplementedError("Binary data without compression is not supported:")
+    assert(sample_size > 0)
+    # Dimensions
+    s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0]
+    # Read whole matrix
+    rows=n
+    buf = fd.read(rows * cols * sample_size)
+    if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32')
+    elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64')
+    else : raise BadSampleSize
+    mat = np.reshape(vec,(rows,cols))
+    return mat
+
+
+def _read_mat_ascii_n(fd, start, end):
+    raise NotImplementedError("Text data not supported.")
+
+
+def _read_compressed_mat_n(fd, format, start, end):
+    """ Read a compressed matrix,
+        see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h
+        methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...),
+    """
+    assert(format == 'CM ') # The formats CM2, CM3 are not supported...
+
+    # Format of header 'struct',
+    global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written,
+    per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')])
+
+    # Read global header,
+    globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0]
+    rows_to_read = end - start
+    
+    # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ]
+    #                                                 {                     cols                     }{         size                 }
+    col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols)
+    col_headers = np.array([np.array([x for x in y]) * globrange * 1.52590218966964e-05 + globmin for y in col_headers], dtype=np.float32)
+    data = np.zeros((cols, rows_to_read), dtype='uint8')
+    header_offset = fd.tell()
+    for c in range(cols):
+        fd.seek((header_offset + rows*(c) +start) )
+        data[c,:] = np.frombuffer(fd.read(rows_to_read), dtype='uint8', count=rows_to_read)
+
+    # Seek to the next key    
+    fd.seek((header_offset + rows*(c+1) ) )
+
+    mat = np.zeros((cols,rows_to_read), dtype='float32')
+    p0 = col_headers[:, 0].reshape(-1, 1)
+    p25 = col_headers[:, 1].reshape(-1, 1)
+    p75 = col_headers[:, 2].reshape(-1, 1)
+    p100 = col_headers[:, 3].reshape(-1, 1)
+
+    mask_0_64 = (data <= 64)
+    mask_193_255 = (data > 192)
+    mask_65_192 = (~(mask_0_64 | mask_193_255))
+
+    mat += (p0    + (p25 - p0) / 64. * data) * mask_0_64.astype(np.float32)
+    mat += (p25 + (p75 - p25) / 128. * (data - 64)) * mask_65_192.astype(np.float32)
+    mat += (p75 + (p100 - p75) / 63. * (data - 192)) * mask_193_255.astype(np.float32)
+
+    return mat.T # transpose! col-major -> row-major,
+
+
+def get_durations(file_or_fd):
+
+    global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written,
+    durations = []
+    fd = open_or_fd(file_or_fd)
+    try:
+        key = read_key(fd)
+        while key:
+            binary = fd.read(2).decode()
+            if binary == '\0B' :
+                header = fd.read(3).decode()
+                # 'FM', 'DM', 'CM', 'CM2', 'CM3' are possible values,
+                if (header == 'CM '): 
+                    # Read global header,
+                    globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0]
+                    offset = fd.tell()
+                    fd.seek( offset + cols*8 + rows*cols  ) # 8 is the size of the column header                     
+                else:
+                    raise NotImplementedError("Only compressed data in format CM is supported.")
+            else:
+                raise NotImplementedError("Text data not supported.")
+            durations.append( rows )
+            key = read_key(fd)        
+    finally:
+        if fd is not file_or_fd : fd.close()
+    return durations
+
+def get_durations_file_list( rxfile ):
+
+    global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written,
+    durations = []
+    for rxf in rxfile:
+        try:
+            fd = open_or_fd(rxf) 
+            binary = fd.read(2).decode()
+            if binary == '\0B' :
+                header = fd.read(3).decode()
+                # 'FM', 'DM', 'CM', 'CM2', 'CM3' are possible values,
+                if (header == 'CM '): 
+                    # Read global header,
+                    globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0]
+                    offset = fd.tell()
+                    fd.seek( offset + cols*8 + rows*cols  ) # 8 is the size of the column header                     
+                else:
+                    raise NotImplementedError("Only compressed data in format CM is supported.")
+            else:
+                raise NotImplementedError("Text data not supported.")
+            durations.append( rows )
+            fd.close()
+        except:
+            print("An exception occurred when reading %s" %rxf)
+            sys.exit(-1)
+    return durations
+
+    
 # Writing,
 def write_mat(file_or_fd, m, key=''):
     """ write_mat(f, m, key='')

From b781084ed025ff30c7ea98d98a1a0e91ed281d93 Mon Sep 17 00:00:00 2001
From: Rohdin Johan A <rohdin@fit.vutbr.cz>
Date: Fri, 3 Apr 2020 16:02:39 +0200
Subject: [PATCH 2/5] Code for efficient reading of sliced features. Option to
 read data from an scp lines provided in a python list.

---
 README.md                       |  13 ++
 kaldi_io/kaldi_io.py            | 288 ++++++++++----------------------
 tests/data/feats.scp            |   6 +
 tests/data/feats_ascii.scp      |   4 +-
 tests/data/feats_compressed.scp |   6 +
 tests/test_kaldi_io.py          |  28 +++-
 6 files changed, 141 insertions(+), 204 deletions(-)
 create mode 100644 tests/data/feats.scp
 create mode 100644 tests/data/feats_compressed.scp

diff --git a/README.md b/README.md
index 9da7084..40235e8 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,14 @@ kaldi-io-for-python
 - Matrix (float, double)
 - Posterior (posteriors, nnet1 training targets, confusion networks, ...)
 
+#### Slicing
+scp files can contain entries like
+"AMI_ES2011a_H00_FEE041_0003714_0003915_slice5 tests/data/feats.ark:14913[2:4,3:6]"
+which in this case means that rows 2 and 3 and columns 3, 4 and 5 are selected. For binary data
+with/without compression, only the relevant elements will be read from disk. For ASCII
+data, all data will be read and then sliced. Currently, only a step size of 1 is
+supported.
+
 #### Examples
 
 ###### Reading feature scp example:
@@ -36,6 +44,11 @@ with kaldi_io.open_or_fd(ark_scp_output,'wb') as f:
     kaldi_io.write_mat(f, mat, key=key)
 ```
 
+### Reading scp in form of a list
+It is also possible to read data using an "scp" stored in a Python list of the form
+["AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats.ark:14913[:,7:13]",
+  "AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats.ark:14913[20:30,7:13]"]
+
 #### Install
 - from pypi: `python -m pip --user install kaldi_io`
 - from sources:
diff --git a/kaldi_io/kaldi_io.py b/kaldi_io/kaldi_io.py
index f85110c..a53fe95 100755
--- a/kaldi_io/kaldi_io.py
+++ b/kaldi_io/kaldi_io.py
@@ -8,7 +8,7 @@
 from __future__ import division
 
 import numpy as np
-import sys, os, re, gzip, struct
+import sys, os, re, gzip, struct, io
 
 #################################################
 # Adding 'kaldi binaries' to shell path,
@@ -354,19 +354,30 @@ def read_mat_scp(file_or_fd):
 
      Read scp to a 'dictionary':
      d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) }
+
+    The scp can also be in a list of the form
+    ["AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats.ark:14913[:,7:13]",
+     "AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats.ark:14913[20:30,7:13]"]
+
     """
-    fd = open_or_fd(file_or_fd)
+    if isinstance(file_or_fd, list): fd = file_or_fd
+    else: fd = open_or_fd(file_or_fd)
+    
     try:
         for line in fd:
-            (key, rxfile) = line.decode().split(' ')
+            
+            if isinstance(line, str): (key, rxfile) = line.split(' ')
+            else: (key, rxfile) = line.decode().split(' ')
+            
             (rxfile, range_slice) = _strip_mat_range(rxfile)
 
-            # TODO, this reads whole file, and then selects the range.
-            # A faster solution would be to change API of read_mat() and load just the frames we need...
-            mat = read_mat(rxfile)
-            if range_slice is not None: mat = (mat[range_slice]).copy() # apply the range_slice,
-            #
-
+            if range_slice is not None:
+                if ( (range_slice[0].step != None) or (len(range_slice)==2 and (range_slice[1].step != None)) ):
+                    raise NotImplementedError("Step other than 1 in slices is currently not supported.")
+                mat = read_mat(rxfile, range_slice)
+            else:
+                mat = read_mat(rxfile)
+                
             yield key, mat
     finally:
         if fd is not file_or_fd : fd.close()
@@ -430,7 +441,8 @@ def _strip_mat_range(rxfile_with_range):
     return (rxfile, tuple(slice_arr))
 
 
-def read_mat(file_or_fd):
+
+def read_mat(file_or_fd, range_slice=None):
     """ [mat] = read_mat(file_or_fd)
      Reads single kaldi matrix, supports ascii and binary.
      file_or_fd : file, gzipped file, pipe or opened file descriptor.
@@ -439,32 +451,28 @@ def read_mat(file_or_fd):
     try:
         binary = fd.read(2).decode()
         if binary == '\0B' :
-            mat = _read_mat_binary(fd)
+            mat = _read_mat_binary(fd, range_slice)
         else:
-            assert(binary == ' [')
             mat = _read_mat_ascii(fd)
+            if range_slice is not None: mat = (mat[range_slice]).copy()
+            
     finally:
         if fd is not file_or_fd: fd.close()
     return mat
 
-def _read_mat_binary(fd):
+
+def _read_mat_binary(fd, range_slice=None):
     # Data type
     header = fd.read(3).decode()
     # 'CM', 'CM2', 'CM3' are possible values,
-    if header.startswith('CM'): return _read_compressed_mat(fd, header)
-    elif header == 'FM ': sample_size = 4 # floats
-    elif header == 'DM ': sample_size = 8 # doubles
+    if header.startswith('CM'): return _read_compressed_mat(fd, header, range_slice)
+    elif header == 'FM ': floatX ='float32' # floats
+    elif header == 'DM ': floatX = 'float64' # doubles
     else: raise UnknownMatrixHeader("The header contained '%s'" % header)
-    assert(sample_size > 0)
+
     # Dimensions
     s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0]
-    # Read whole matrix
-    buf = fd.read(rows * cols * sample_size)
-    if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32')
-    elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64')
-    else : raise BadSampleSize
-    mat = np.reshape(vec,(rows,cols))
-    return mat
+    return _read_range_slice(fd, rows, cols, floatX, range_slice=range_slice)
 
 
 def _read_mat_ascii(fd):
@@ -482,13 +490,13 @@ def _read_mat_ascii(fd):
             return mat
 
 
-def _read_compressed_mat(fd, format):
+def _read_compressed_mat(fd, format, range_slice):
     """ Read a compressed matrix,
         see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h
         methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...),
     """
     assert(format == 'CM ') # The formats CM2, CM3 are not supported...
-
+    
     # Format of header 'struct',
     global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written,
     per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')])
@@ -496,133 +504,20 @@ def _read_compressed_mat(fd, format):
     # Read global header,
     globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0]
 
-    # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ]
-    #                                                 {                     cols                     }{         size                 }
-    col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols)
-    col_headers = np.array([np.array([x for x in y]) * globrange * 1.52590218966964e-05 + globmin for y in col_headers], dtype=np.float32)
-    data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major,
-
-    mat = np.zeros((cols,rows), dtype='float32')
-    p0 = col_headers[:, 0].reshape(-1, 1)
-    p25 = col_headers[:, 1].reshape(-1, 1)
-    p75 = col_headers[:, 2].reshape(-1, 1)
-    p100 = col_headers[:, 3].reshape(-1, 1)
-    mask_0_64 = (data <= 64)
-    mask_193_255 = (data > 192)
-    mask_65_192 = (~(mask_0_64 | mask_193_255))
-
-    mat += (p0    + (p25 - p0) / 64. * data) * mask_0_64.astype(np.float32)
-    mat += (p25 + (p75 - p25) / 128. * (data - 64)) * mask_65_192.astype(np.float32)
-    mat += (p75 + (p100 - p75) / 63. * (data - 192)) * mask_193_255.astype(np.float32)
-
-    return mat.T # transpose! col-major -> row-major,
-
-
-###
-def read_file_segm(rxfile, start, end):
-
-    assert( len(rxfile)==len(start)==len(end) )
-
-    data = []
-    try:
-        for i,rxf in enumerate( rxfile ):
-            data.append( read_mat_n(rxf, start[i], end[i]) )
-    except:
-          print("An exception occurred")
-    return data 
-                
-def read_mat_ark_n(file_or_fd, start, end):
-    """ generator(key,mat) = read_mat_ark(file_or_fd)
-     Returns generator of (key,matrix) tuples, read from ark file/stream.
-     file_or_fd : scp, gzipped scp, pipe or opened file descriptor.
-
-     Iterate the ark:
-     for key,mat in kaldi_io.read_mat_ark(file):
-         ...
-
-     Read ark to a 'dictionary':
-     d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) }
-    """
-    fd = open_or_fd(file_or_fd)
-    try:
-        key = read_key(fd)
-        while key:
-            mat = read_mat_n(fd, start, end)
-            yield key, mat
-            key = read_key(fd)
-    finally:
-        if fd is not file_or_fd : fd.close()
-
-
-def read_mat_n(file_or_fd, start, end):
-    """ [mat] = read_mat(file_or_fd)
-     Reads single kaldi matrix, supports ascii and binary.
-     file_or_fd : file, gzipped file, pipe or opened file descriptor.
-    """
-    fd = open_or_fd(file_or_fd)
-    try:
-        binary = fd.read(2).decode()
-        if binary == '\0B' :
-            mat = _read_mat_binary_n(fd, start, end)
-        else:
-            assert(binary == ' [')
-            mat = _read_mat_ascii(fd, start, end)
-    finally:
-        if fd is not file_or_fd: fd.close()
-    return mat
-
-def _read_mat_binary_n(fd, start, end):
-    # Data type
-    header = fd.read(3).decode()
-    # 'CM', 'CM2', 'CM3' are possible values,
-    if header.startswith('CM'): return _read_compressed_mat_n(fd, header, start, end)
-    else: raise NotImplementedError("Binary data without compression is not supported:")
-    assert(sample_size > 0)
-    # Dimensions
-    s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0]
-    # Read whole matrix
-    rows=n
-    buf = fd.read(rows * cols * sample_size)
-    if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32')
-    elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64')
-    else : raise BadSampleSize
-    mat = np.reshape(vec,(rows,cols))
-    return mat
-
-
-def _read_mat_ascii_n(fd, start, end):
-    raise NotImplementedError("Text data not supported.")
-
-
-def _read_compressed_mat_n(fd, format, start, end):
-    """ Read a compressed matrix,
-        see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h
-        methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...),
-    """
-    assert(format == 'CM ') # The formats CM2, CM3 are not supported...
-
-    # Format of header 'struct',
-    global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written,
-    per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')])
-
-    # Read global header,
-    globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0]
-    rows_to_read = end - start
+    # Standardize range_slice
+    if range_slice is None: range_slice = (slice(None,None,None),slice(None,None,None))
+    elif len(range_slice) ==1 : range_slice = (range_slice[0],slice(None,None,None))
     
     # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ]
     #                                                 {                     cols                     }{         size                 }
-    col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols)
+    col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols)[range_slice[1]]
     col_headers = np.array([np.array([x for x in y]) * globrange * 1.52590218966964e-05 + globmin for y in col_headers], dtype=np.float32)
-    data = np.zeros((cols, rows_to_read), dtype='uint8')
-    header_offset = fd.tell()
-    for c in range(cols):
-        fd.seek((header_offset + rows*(c) +start) )
-        data[c,:] = np.frombuffer(fd.read(rows_to_read), dtype='uint8', count=rows_to_read)
 
-    # Seek to the next key    
-    fd.seek((header_offset + rows*(c+1) ) )
-
-    mat = np.zeros((cols,rows_to_read), dtype='float32')
+    # Note that, contrary to standard matrices, the compressed matrices are column-major
+    # so we need to flip rows and colums when using the below function for reading.
+    data = _read_range_slice(fd, cols, rows, 'uint8', range_slice=(range_slice[1],range_slice[0]))
+    
+    mat = np.zeros_like(data, dtype='float32')
     p0 = col_headers[:, 0].reshape(-1, 1)
     p25 = col_headers[:, 1].reshape(-1, 1)
     p75 = col_headers[:, 2].reshape(-1, 1)
@@ -639,59 +534,54 @@ def _read_compressed_mat_n(fd, format, start, end):
     return mat.T # transpose! col-major -> row-major,
 
 
-def get_durations(file_or_fd):
-
-    global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written,
-    durations = []
-    fd = open_or_fd(file_or_fd)
-    try:
-        key = read_key(fd)
-        while key:
-            binary = fd.read(2).decode()
-            if binary == '\0B' :
-                header = fd.read(3).decode()
-                # 'FM', 'DM', 'CM', 'CM2', 'CM3' are possible values,
-                if (header == 'CM '): 
-                    # Read global header,
-                    globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0]
-                    offset = fd.tell()
-                    fd.seek( offset + cols*8 + rows*cols  ) # 8 is the size of the column header                     
-                else:
-                    raise NotImplementedError("Only compressed data in format CM is supported.")
-            else:
-                raise NotImplementedError("Text data not supported.")
-            durations.append( rows )
-            key = read_key(fd)        
-    finally:
-        if fd is not file_or_fd : fd.close()
-    return durations
+def _read_range_slice(fd, rows, cols, dtype, range_slice=None):
 
-def get_durations_file_list( rxfile ):
+    if (dtype == 'float32'): sample_size=4
+    elif (dtype == 'float64'): sample_size=8
+    elif (dtype == 'uint8'): sample_size=1
+    else: raise UnsupportedDataType("Data type was %s" % str(dtype))
+    
+    # Find the start and end indices etc.
+    if range_slice is None: range_slice = (slice(None,None,None),slice(None,None,None))
+    #
+    start_row    = 0 if range_slice[0].start is None else range_slice[0].start
+    end_row      = rows if range_slice[0].stop is None else range_slice[0].stop
+    rows_to_read = end_row - start_row
+    #
+    if (len(range_slice)==2):
+        start_col = 0 if range_slice[1].start is None else range_slice[1].start
+        end_col   = cols if range_slice[1].stop is None else range_slice[1].stop
+    else:
+        start_col = 0
+        end_col   = cols
+        
+    # We want to read the data using as few seek as possible. So the procedure will    
+    # be different depending on the properties of range_slice
+    
+    if (start_col == 0 and end_col == cols):
+        # In this case we can read consequtively
+        if fd.seekable():
+            header_offset = fd.tell()
+            fd.seek(header_offset + start_row*cols*sample_size )
+        else:
+            # In this case the input is pipe and there should be no offset
+            assert (start_row ==0), ("Start row is %s but should be 0 for non-seekable data" %str(start_row))
+            
+        buf = fd.read(rows_to_read * cols * sample_size)
+        vec = np.frombuffer(buf, dtype=dtype)
+        mat = np.reshape(vec,(rows_to_read,cols))
+    else:
+        # In this case we need to read at different places
+        assert fd.seekable(), ("fd %str(fd)is not seekable" % str(fd) )  # Again, this should not happend for pipes
+        header_offset = fd.tell()
+        cols_to_read = end_col - start_col
+        mat = np.zeros((rows_to_read, cols_to_read), dtype=dtype)
+        for r in range(start_row, end_row):
+            fd.seek(header_offset + (r*cols + start_col)*sample_size )
+            d = fd.read(cols_to_read*sample_size)
+            mat[r-start_row,:] = (np.frombuffer(d, dtype=dtype, count=cols_to_read))
 
-    global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written,
-    durations = []
-    for rxf in rxfile:
-        try:
-            fd = open_or_fd(rxf) 
-            binary = fd.read(2).decode()
-            if binary == '\0B' :
-                header = fd.read(3).decode()
-                # 'FM', 'DM', 'CM', 'CM2', 'CM3' are possible values,
-                if (header == 'CM '): 
-                    # Read global header,
-                    globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0]
-                    offset = fd.tell()
-                    fd.seek( offset + cols*8 + rows*cols  ) # 8 is the size of the column header                     
-                else:
-                    raise NotImplementedError("Only compressed data in format CM is supported.")
-            else:
-                raise NotImplementedError("Text data not supported.")
-            durations.append( rows )
-            fd.close()
-        except:
-            print("An exception occurred when reading %s" %rxf)
-            sys.exit(-1)
-    return durations
+    return mat
 
     
 # Writing,
diff --git a/tests/data/feats.scp b/tests/data/feats.scp
new file mode 100644
index 0000000..8b2ec58
--- /dev/null
+++ b/tests/data/feats.scp
@@ -0,0 +1,6 @@
+AMI_ES2011a_H00_FEE041_0003427_0003714 tests/data/feats.ark:39
+AMI_ES2011a_H00_FEE041_0003714_0003915 tests/data/feats.ark:14913
+AMI_ES2011a_H00_FEE041_0003714_0003915_slice1 tests/data/feats.ark:14913[:]
+AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats.ark:14913[0:5]
+AMI_ES2011a_H00_FEE041_0003714_0003915_slice3 tests/data/feats.ark:14913[:,7:13]
+AMI_ES2011a_H00_FEE041_0003714_0003915_slice4 tests/data/feats.ark:14913[20:30,7:13]
diff --git a/tests/data/feats_ascii.scp b/tests/data/feats_ascii.scp
index 2f88d8e..c338a9f 100644
--- a/tests/data/feats_ascii.scp
+++ b/tests/data/feats_ascii.scp
@@ -2,5 +2,5 @@ AMI_ES2011a_H00_FEE041_0003427_0003714 tests/data/feats_ascii.ark:39
 AMI_ES2011a_H00_FEE041_0003714_0003915 tests/data/feats_ascii.ark:35877
 AMI_ES2011a_H00_FEE041_0003714_0003915_slice1 tests/data/feats_ascii.ark:35877[:]
 AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats_ascii.ark:35877[0:5]
-AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats_ascii.ark:35877[:,7:13]
-AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats_ascii.ark:35877[20:30,7:13]
+AMI_ES2011a_H00_FEE041_0003714_0003915_slice3 tests/data/feats_ascii.ark:35877[:,7:13]
+AMI_ES2011a_H00_FEE041_0003714_0003915_slice4 tests/data/feats_ascii.ark:35877[20:30,7:13]
diff --git a/tests/data/feats_compressed.scp b/tests/data/feats_compressed.scp
new file mode 100644
index 0000000..eb9d115
--- /dev/null
+++ b/tests/data/feats_compressed.scp
@@ -0,0 +1,6 @@
+AMI_ES2011a_H00_FEE041_0003427_0003714 tests/data/feats_compressed.ark:39
+AMI_ES2011a_H00_FEE041_0003714_0003915 tests/data/feats_compressed.ark:3908
+AMI_ES2011a_H00_FEE041_0003714_0003915_slice1 tests/data/feats_compressed.ark:3908[:]
+AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats_compressed.ark:3908[0:5]
+AMI_ES2011a_H00_FEE041_0003714_0003915_slice3 tests/data/feats_compressed.ark:3908[:,7:13]
+AMI_ES2011a_H00_FEE041_0003714_0003915_slice4 tests/data/feats_compressed.ark:3908[20:30,7:13]
\ No newline at end of file
diff --git a/tests/test_kaldi_io.py b/tests/test_kaldi_io.py
index e02adda..627df16 100644
--- a/tests/test_kaldi_io.py
+++ b/tests/test_kaldi_io.py
@@ -47,13 +47,17 @@ def testMatrixReadWrite(self):
         # read,
         flt_mat = { k:m for k,m in kaldi_io.read_mat_scp('tests/data/feats_ascii.scp') } # ascii-scp,
         flt_mat2 = { k:m for k,m in kaldi_io.read_mat_ark('tests/data/feats_ascii.ark') } # ascii-ark,
-        flt_mat3 = { k:m for k,m in kaldi_io.read_mat_ark('tests/data/feats.ark') } # ascii-ark,
+        flt_mat3 = { k:m for k,m in kaldi_io.read_mat_scp('tests/data/feats.scp') } # scp for binary,
+        flt_mat4 = { k:m for k,m in kaldi_io.read_mat_ark('tests/data/feats.ark') } # binary-ark,
+        flt_mat5 = { k:m for k,m in kaldi_io.read_mat_scp('tests/data/feats_compressed.scp') } # scp for compressed binary,
+        flt_mat6 = { k:m for k,m in kaldi_io.read_mat_ark('tests/data/feats_compressed.ark') } # compressed binary-ark,
+
         # store,
         with kaldi_io.open_or_fd('tests/data_re-saved/mat.ark','wb') as f:
-            for k,m in flt_mat3.items(): kaldi_io.write_mat(f, m, k)
+            for k,m in flt_mat6.items(): kaldi_io.write_mat(f, m, k)
         # read and compare,
         for k,m in kaldi_io.read_mat_ark('tests/data_re-saved/mat.ark'):
-            self.assertTrue(np.array_equal(m, flt_mat3[k]), msg="flt. matrix same after re-saving")
+            self.assertTrue(np.array_equal(m, flt_mat6[k]), msg="flt. matrix same after re-saving")
 
     def testPipeReadWrite(self):
         """
@@ -77,6 +81,24 @@ def testPipeReadWrite(self):
             flt_vec4 = { k:v for k,v in kaldi_io.read_vec_flt_ark('ark:copy-vector ark:tests/data/conf.ark ark:- |') }
 
 
+    def testListScp(self):
+        print ("list scp")
+        scp = ["AMI_ES2011a_H00_FEE041_0003427_0003714 tests/data/feats.ark:39",
+               "AMI_ES2011a_H00_FEE041_0003714_0003915 tests/data/feats.ark:14913",
+               "AMI_ES2011a_H00_FEE041_0003714_0003915_slice1 tests/data/feats.ark:14913[:]",
+               "AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats.ark:14913[0:5]",
+               "AMI_ES2011a_H00_FEE041_0003714_0003915_slice3 tests/data/feats.ark:14913[:,7:13]",
+               "AMI_ES2011a_H00_FEE041_0003714_0003915_slice4 tests/data/feats.ark:14913[20:30,7:13]"]
+
+        flt_mat = { k:m for k,m in kaldi_io.read_mat_scp(scp) } # ascii-scp,
+
+        # store,
+        with kaldi_io.open_or_fd('tests/data_re-saved/mat_listScp.ark','wb') as f:
+            for k,m in flt_mat.items(): kaldi_io.write_mat(f, m, k)
+        # read and compare,
+        for k,m in kaldi_io.read_mat_ark('tests/data_re-saved/mat_listScp.ark'):
+            self.assertTrue(np.array_equal(m, flt_mat[k]), msg="flt. matrix same after re-saving")
+
 class PosteriorIOTest(unittest.TestCase):
     def testWriteReadPosteriors(self):
         data = [[(0, 0.0), (1, 0.1), (2, 0.2)],

From c9fb6c9b1a387495f3faf7de624493fa27d52de8 Mon Sep 17 00:00:00 2001
From: Rohdin Johan A <rohdin@fit.vutbr.cz>
Date: Fri, 3 Apr 2020 17:34:33 +0200
Subject: [PATCH 3/5] Added comments about slicing concerns.

---
 kaldi_io/kaldi_io.py | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/kaldi_io/kaldi_io.py b/kaldi_io/kaldi_io.py
index a53fe95..aab12a1 100755
--- a/kaldi_io/kaldi_io.py
+++ b/kaldi_io/kaldi_io.py
@@ -556,14 +556,13 @@ def _read_range_slice(fd, rows, cols, dtype, range_slice=None):
         end_col   = cols
         
     # We want to read the data using as few seek as possible. So the procedure will    
-    # be different depending on the properties of range_slice
-    
+    # be different depending on the properties of range_slice    
     if (start_col == 0 and end_col == cols):
         # In this case we can read consequtively
-        if fd.seekable():
-            header_offset = fd.tell()
-            fd.seek(header_offset + start_row*cols*sample_size )
-        else:
+        if fd.seekable():                                           # Comment 1: We only read slices on seekable input. This should be
+            header_offset = fd.tell()                               # every case except piped input, right? And there is no way that piped
+            fd.seek(header_offset + start_row*cols*sample_size )    # input could come with slice information since slice info is provided in scp.
+        else:                                                       
             # In this case the input is pipe and there should be no offset
             assert (start_row ==0), ("Start row is %s but should be 0 for non-seekable data" %str(start_row))
             
@@ -577,10 +576,18 @@ def _read_range_slice(fd, rows, cols, dtype, range_slice=None):
         cols_to_read = end_col - start_col
         mat = np.zeros((rows_to_read, cols_to_read), dtype=dtype)
         for r in range(start_row, end_row):
-            fd.seek(header_offset + (r*cols + start_col)*sample_size )
+            fd.seek( header_offset + (r*cols + start_col)*sample_size )
             d = fd.read(cols_to_read*sample_size)
             mat[r-start_row,:] = (np.frombuffer(d, dtype=dtype, count=cols_to_read))
 
+    # Comment 2: Providing slice info via "read_mat_ark" is currently not supported.
+    # If we want to extend it so that it takes slice info as input, i.e.
+    # read_mat_ark(file_or_fd, list_of_row_slices, list_of_col_slices), where "list_of_row_slices"
+    # contains one slice per key in the ark file, we would have to add something like this here:
+    # # Seek to the next key
+    #    fd.seek( header_offset + (end_row*cols + start_col)*sample_size )
+    # to make sure that we are at the start of the next key once the data has been read.
+            
     return mat
 
     

From e5251e2677afd70225eae8d07b685788ba24d502 Mon Sep 17 00:00:00 2001
From: Rohdin Johan A <rohdin@fit.vutbr.cz>
Date: Fri, 3 Apr 2020 18:19:46 +0200
Subject: [PATCH 4/5] Fixed README formatting

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 40235e8..9d77f53 100644
--- a/README.md
+++ b/README.md
@@ -9,9 +9,9 @@ kaldi-io-for-python
 - Matrix (float, double)
 - Posterior (posteriors, nnet1 training targets, confusion networks, ...)
 
-#### Sclicing
+#### Slicing
 scp files can contain entries like
-"AMI_ES2011a_H00_FEE041_0003714_0003915_slice5 tests/data/feats.ark:14913[2:4,3:6]"
+``` AMI_ES2011a_H00_FEE041_0003714_0003915_slice5 tests/data/feats.ark:14913[2:4,3:6]```
 which in this case that row 2 and 3 and columns 3,4,5 are selected. For binary data
 with/without compression, only the relevant elements will be read from disk. For ASCII
 data, all data will be read and then sliced. Currently, only a step size of 1 is
@@ -46,8 +46,10 @@ with kaldi_io.open_or_fd(ark_scp_output,'wb') as f:
 
 ### Reading scp in form of a list
 It is als possible to read data using an "scp" stored in a python list of the form
+```
 ["AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats.ark:14913[:,7:13]",
   "AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats.ark:14913[20:30,7:13]"]
+```
 
 #### Install
 - from pypi: `python -m pip --user install kaldi_io`

From 43200380a9a12a3ccb327fe07f0607a3a518cb58 Mon Sep 17 00:00:00 2001
From: Rohdin Johan A <rohdin@fit.vutbr.cz>
Date: Fri, 3 Apr 2020 18:21:42 +0200
Subject: [PATCH 5/5] Fixed README formatting again

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 9d77f53..517cd09 100644
--- a/README.md
+++ b/README.md
@@ -44,8 +44,8 @@ with kaldi_io.open_or_fd(ark_scp_output,'wb') as f:
     kaldi_io.write_mat(f, mat, key=key)
 ```
 
-### Reading scp in form of a list
-It is als possible to read data using an "scp" stored in a python list of the form
+###### Reading scp in form of a list
+It is also possible to read data using an "scp" stored in a python list of the form
 ```
 ["AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats.ark:14913[:,7:13]",
   "AMI_ES2011a_H00_FEE041_0003714_0003915_slice2 tests/data/feats.ark:14913[20:30,7:13]"]