diff --git a/.travis.yml b/.travis.yml
index 5b28cdcb9..a852039f9 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,10 +4,10 @@ sudo: required
 language: python
 
 python:
-  - 2.6
   - 2.7
-  - 3.3
   - 3.4
+  - 3.5
+  - 3.6
   - pypy
 
 env:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 46bccb542..ef14edc2e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,8 @@
+## v0.6.4 (2019-01-31)
+- Fix bugs in `read_multiple_bytes` (thanks to @tsh56)
+- Remove support for end-of-life Python versions 2.6, 3.2, and 3.3. Add CI tests for 3.5 and 3.6
+- Expose SchemaParser in Cython header
+
 ## v0.6.3 (2018-01-14)
 - Bump bundled capnp version to v0.6.1 (thanks to @E8Yuval)
 - Fix a memleak in RemotePromise (thanks to @E8Yuval)
diff --git a/DEPLOY.md b/DEPLOY.md
new file mode 100644
index 000000000..69eae40fe
--- /dev/null
+++ b/DEPLOY.md
@@ -0,0 +1,44 @@
+# Deployment instructions for PyPI
+
+This file is meant for maintainers of pycapnp, and documents the process for uploading to PyPI.
+
+## Pre-requisites
+
+```
+pip install pypandoc cython
+```
+
+## Run tests
+
+I typically sanity check by running the tests once again locally, but as long as Travis is green, you're probably fine.
+
+## Add a commit that bumps the version
+
+Bump the version in setup.py, and add descriptions of all the changes to CHANGELOG.md (see 19e1b189caa786c7f572e679d6bb94aadfbdb5e0 for an example commit).
+
+## Run the build and upload
+
+Run the following command to clean up old artifacts, run the build, and then upload the result to PyPI:
+
+```
+rm -rf bundled/ capnp/version.py capnp/lib/capnp.{h,cpp} build; python setup.py build && python setup.py sdist upload -r PyPI
+```
+
+## Test the PyPI release
+
+After the upload, I manually test the PyPI release. I keep a few virtualenvs in which I run the following command (run it from the pycapnp directory, since it runs the tests at the end):
+```
+yes | pip uninstall pycapnp; pip install pycapnp && py.test test
+```
+
+I usually test the following configurations:
+- Python 2.7 with and without cython installed
+- Python 3.6 with and without cython installed
+
+This step could probably benefit greatly from some automation. Perhaps even Travis could handle it, but I'm not sure how best to trigger Travis from a PyPI release.
+
+## Tag the GitHub release
+
+Tag the release on the develop branch (not the master branch). Sadly, I've stopped using git-flow, and at this point it might be worth moving back to using just master, but that would take some amount of work and I worry that it would break open PRs. Definitely worth considering if development picks back up.
+
+Version numbers roughly follow semver, although I try to loosely follow upstream Cap'n Proto C++ versions as well. So when pycapnp officially starts using v0.7.0 of the C++ library, pycapnp's version should be bumped to v0.7.0 as well.
diff --git a/README.md b/README.md
index e80586c47..30f8d2873 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@ If you wish to install using the latest upstream C++ Cap'n Proto:
 
 ## Python Versions
 
-Python 2.6/2.7 are supported as well as Python 3.2+. PyPy 2.1+ is also supported.
+Python 2.7, Python 3.4+, and PyPy 2.1+ are supported.
 
 One oddity to note is that `Text` type fields will be treated as byte strings under Python 2, and unicode strings under Python 3. `Data` fields will always be treated as byte strings.
 
diff --git a/capnp/includes/schema_cpp.pxd b/capnp/includes/schema_cpp.pxd
index d9b691cde..9cec13dc2 100644
--- a/capnp/includes/schema_cpp.pxd
+++ b/capnp/includes/schema_cpp.pxd
@@ -786,6 +786,7 @@ cdef extern from "capnp/serialize.h" namespace " ::capnp":
     cdef cppclass FlatArrayMessageReader(MessageReader):
         FlatArrayMessageReader(WordArrayPtr array) except +reraise_kj_exception
         FlatArrayMessageReader(WordArrayPtr array, ReaderOptions) except +reraise_kj_exception
+        const word* getEnd() const
 
     void writeMessageToFd(int, MessageBuilder&) except +reraise_kj_exception
 
diff --git a/capnp/lib/capnp.pxd b/capnp/lib/capnp.pxd
index dc0ed4ea6..0882bb78a 100644
--- a/capnp/lib/capnp.pxd
+++ b/capnp/lib/capnp.pxd
@@ -12,6 +12,18 @@ cdef class _StructSchemaField:
     cdef object _parent
     cdef _init(self, C_StructSchema.Field other, parent=?)
 
+cdef class _StringArrayPtr:  # attribute declarations live in this .pxd (moved out of capnp.pyx)
+    cdef StringPtr * thisptr  # C array of `size` StringPtr elements, malloc'ed in __cinit__ (capnp.pyx)
+    cdef object parent  # NOTE(review): presumably the `parent` argument of __cinit__, kept to pin the strings; confirm
+    cdef size_t size  # element count passed to __cinit__
+    cdef ArrayPtr[StringPtr] asArrayPtr(self) except +reraise_kj_exception
+
+cdef class SchemaParser:  # attribute declarations live in this .pxd (moved out of capnp.pyx)
+    cdef C_SchemaParser * thisptr  # C++ parser allocated with `new` in __cinit__ (capnp.pyx)
+    cdef public dict modules_by_id  # `public`: readable and writable from Python code
+    cdef list _all_imports  # NOTE(review): purpose not visible in this hunk; see capnp.pyx
+    cdef _StringArrayPtr _last_import_array  # NOTE(review): purpose not visible in this hunk; see capnp.pyx
+    cpdef _parse_disk_file(self, displayName, diskPath, imports) except +reraise_kj_exception
 
 cdef class _DynamicOrphan:
     cdef C_DynamicOrphan thisptr
diff --git a/capnp/lib/capnp.pyx b/capnp/lib/capnp.pyx
index 8b5877d4d..54a1f7db2 100644
--- a/capnp/lib/capnp.pyx
+++ b/capnp/lib/capnp.pyx
@@ -3107,10 +3107,6 @@ class _EnumModule(object):
             setattr(self, name, val)
 
 cdef class _StringArrayPtr:
-    cdef StringPtr * thisptr
-    cdef object parent
-    cdef size_t size
-
     def __cinit__(self, size_t size, parent):
         self.size = size
         self.thisptr = <StringPtr *>malloc(sizeof(StringPtr) * size)
@@ -3128,10 +3124,6 @@ cdef class SchemaParser:
 
     Do not use this class unless you're sure you know what you're doing. Use the convenience method :func:`load` instead.
     """
-    cdef C_SchemaParser * thisptr
-    cdef public dict modules_by_id
-    cdef list _all_imports
-    cdef _StringArrayPtr _last_import_array
 
     def __cinit__(self):
         self.thisptr = new C_SchemaParser()
@@ -3645,40 +3637,44 @@ cdef class _MultiplePackedMessageReader:
         return self
 
 cdef class _MultipleBytesMessageReader:
-    cdef schema_cpp.ArrayInputStream * stream
-    cdef schema_cpp.BufferedInputStream * buffered_stream
-    cdef cbool skip_copy
-
-    cdef public object traversal_limit_in_words, nesting_limit, schema, buf
+    cdef Py_ssize_t offset, sz
+    cdef const char *ptr
+    cdef object _object_to_pin
+    cdef public object traversal_limit_in_words, nesting_limit, schema
 
-    def __init__(self, buf, schema, traversal_limit_in_words = None, nesting_limit = None, skip_copy = False):
+    def __init__(self, buf, schema, traversal_limit_in_words = None, nesting_limit = None):
+        self.offset = 0  # byte offset of the next unread message inside the buffer
         self.schema = schema
         self.traversal_limit_in_words = traversal_limit_in_words
         self.nesting_limit = nesting_limit
-        self.skip_copy = skip_copy
 
-        cdef const void *ptr
-        cdef Py_ssize_t sz
-        PyObject_AsReadBuffer(buf, &ptr, &sz)
-
-        self.buf = buf
-        self.stream = new schema_cpp.ArrayInputStream(schema_cpp.ByteArrayPtr(<byte *>ptr, sz))
-        self.buffered_stream = new schema_cpp.BufferedInputStreamWrapper(deref(self.stream))
+        self.sz = len(buf)  # NOTE(review): used as a byte count; len() differs for buffers with itemsize > 1 - confirm
+        if isinstance(buf, bytes):
+            self.ptr = buf
+            if (<uintptr_t>self.ptr) % 8 != 0:  # address is not a multiple of 8: switch to an _AlignedBuffer
+                aligned = _AlignedBuffer(buf)
+                self.ptr = aligned.buf
+                self._object_to_pin = aligned
+            else:
+                self._object_to_pin = buf
+                # self.ptr already points at buf's storage (assigned before the alignment check)
+        elif PyObject_CheckBuffer(buf):
+            view = _BufferView(buf)  # NOTE(review): no 8-byte alignment check on this path, unlike the bytes branch
+            self.ptr = view.buf
+            self._object_to_pin = view
+        else:
+            raise TypeError('expected buffer-like object in _MultipleBytesMessageReader')
 
-    def __dealloc__(self):
-        del self.buffered_stream
-        del self.stream
 
     def __next__(self):
+        cdef _FlatArrayMessageReaderAligned reader
+        if self.offset == self.sz:
+            raise StopIteration
         try:
-            # FIXME:  Instead of doing a copy of the reader to advance stream pointer
-            #         we should just use the segment table to compute the message length.
-            # FIXME:  This probably suffers from the same orphan problem as `_MultipleMessageReader`
-            reader = _InputMessageReader()._init(deref(self.buffered_stream), self.traversal_limit_in_words, self.nesting_limit, self)
-            ret = reader.get_root(self.schema)
-            if not self.skip_copy:
-              ret = ret.as_builder().as_reader()
-            return ret
+            reader = _FlatArrayMessageReaderAligned()
+            reader._init(self._object_to_pin, self.ptr + self.offset, self.sz - self.offset, self.traversal_limit_in_words, self.nesting_limit)
+            self.offset += reader.msg_size
+            return reader.get_root(self.schema)
         except KjException as e:
             if 'EOF' in str(e):
                 raise StopIteration
@@ -3767,6 +3763,37 @@ cdef class _BufferView:
     def __dealloc__(self):
         PyBuffer_Release(&self.view)
 
+@cython.internal
+cdef class _FlatArrayMessageReaderAligned(_MessageReader):
+    """
+    Creates a reader based on a contiguous block of memory
+
+    For performance reasons the provided buffer is assumed to be already aligned. This
+    allows a set of adjacent messages to be aligned with a single align operation.
+    """
+    cdef object _object_to_pin  # object handed to _init() together with the raw pointer
+    cdef Py_ssize_t msg_size  # byte distance from `ptr` to the end of the parsed message; 0 until _init() runs
+    def __init__(self):
+        self.msg_size = 0
+
+
+    cdef _init(self, buf, const char *ptr, Py_ssize_t sz, traversal_limit_in_words = None, nesting_limit = None):
+        cdef schema_cpp.ReaderOptions opts = make_reader_opts(traversal_limit_in_words, nesting_limit)
+        cdef schema_cpp.FlatArrayMessageReader * flat_reader
+
+        self._object_to_pin = buf  # NOTE(review): presumably keeps the memory behind `ptr` alive - confirm
+
+        flat_reader = new schema_cpp.FlatArrayMessageReader(
+            schema_cpp.WordArrayPtr(<schema_cpp.word*>ptr, sz//8),  # sz is in bytes; the array length is given in 8-byte words
+            opts)
+        self.thisptr = flat_reader
+        self.msg_size = <char *>flat_reader.getEnd() - ptr  # callers advance their offset by this many bytes
+        return self
+
+    def __dealloc__(self):
+        del self.thisptr
+
+
 @cython.internal
 cdef class _FlatArrayMessageReader(_MessageReader):
     cdef object _object_to_pin
diff --git a/setup.py b/setup.py
index a3996a7d4..afb5d016f 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@
 
 MAJOR = 0
 MINOR = 6
-MICRO = 3
+MICRO = 4
 VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO)
 
 
@@ -49,6 +49,8 @@ def write_version_py(filename=None):
     changelog = '\nChangelog\n=============\n' + changelog
     long_description += changelog
 except (IOError, ImportError):
+    if 'sdist' in sys.argv[1:]:  # a source release must not ship with an empty long_description
+        raise
     long_description = ''
 
 # Clean command, invoked with `python setup.py clean`
@@ -171,11 +173,11 @@ def run(self):
         'Programming Language :: C++',
         'Programming Language :: Cython',
         'Programming Language :: Python :: 2',
-        'Programming Language :: Python :: 2.6',
         'Programming Language :: Python :: 2.7',
         'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.2',
-        'Programming Language :: Python :: 3.3',
+        'Programming Language :: Python :: 3.4',
+        'Programming Language :: Python :: 3.5',
+        'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: Implementation :: PyPy',
         'Topic :: Communications'],
 )
diff --git a/test/test_large_read.py b/test/test_large_read.py
index c925d1f48..e9120085c 100644
--- a/test/test_large_read.py
+++ b/test/test_large_read.py
@@ -1,4 +1,5 @@
 import pytest
+import platform
 import capnp
 import os
 import tempfile
@@ -40,3 +41,59 @@ def test_large_read_multiple(test_capnp):
 
     for m in test_capnp.Msg.read_multiple(f):
         pass
+
+def get_two_adjacent_messages(test_capnp):  # helper: bytes of two serialized Msg messages, back to back
+    msg1 = test_capnp.Msg.new_message()
+    msg1.data = [0x41] * 8192  # first message carries an 8192-element payload
+    m1 = msg1.to_bytes()
+    msg2 = test_capnp.Msg.new_message()  # second message has no fields set
+    m2 = msg2.to_bytes()
+
+    return  m1 + m2
+
+def test_large_read_multiple_bytes(test_capnp):
+    data = get_two_adjacent_messages(test_capnp)
+    for m in test_capnp.Msg.read_multiple_bytes(data):  # well-formed input: iteration must finish without raising
+        pass
+
+    with pytest.raises(capnp.KjException):
+        data = get_two_adjacent_messages(test_capnp)[:-1]  # drop the last byte so the second message is truncated
+        for m in test_capnp.Msg.read_multiple_bytes(data):
+            pass
+
+    with pytest.raises(capnp.KjException):
+        data = get_two_adjacent_messages(test_capnp) + b' '  # one stray byte after the second message
+        for m in test_capnp.Msg.read_multiple_bytes(data):
+            pass
+
+@pytest.mark.skipif(platform.python_implementation() == 'PyPy', reason="PyPy memoryview support is limited")
+def test_large_read_mutltiple_bytes_memoryview(test_capnp):  # NOTE(review): "mutltiple" looks like a typo for "multiple"
+    data = get_two_adjacent_messages(test_capnp)
+    for m in test_capnp.Msg.read_multiple_bytes(memoryview(data)):  # well-formed input wrapped in a memoryview
+        pass
+
+    with pytest.raises(capnp.KjException):
+        data = get_two_adjacent_messages(test_capnp)[:-1]  # drop the last byte so the second message is truncated
+        for m in test_capnp.Msg.read_multiple_bytes(memoryview(data)):
+            pass
+
+    with pytest.raises(capnp.KjException):
+        data = get_two_adjacent_messages(test_capnp) + b' '  # one stray byte after the second message
+        for m in test_capnp.Msg.read_multiple_bytes(memoryview(data)):
+            pass
+
+@pytest.mark.skipif(sys.version_info[0] == 3, reason="Legacy buffer support only for python 2.7")
+def test_large_read_mutltiple_bytes_buffer(test_capnp):  # NOTE(review): "mutltiple" looks like a typo for "multiple"
+    data = get_two_adjacent_messages(test_capnp)
+    for m in test_capnp.Msg.read_multiple_bytes(buffer(data)):  # well-formed input wrapped in a Python 2 buffer()
+        pass
+
+    with pytest.raises(capnp.KjException):
+        data = get_two_adjacent_messages(test_capnp)[:-1]  # drop the last byte so the second message is truncated
+        for m in test_capnp.Msg.read_multiple_bytes(buffer(data)):
+            pass
+
+    with pytest.raises(capnp.KjException):
+        data = get_two_adjacent_messages(test_capnp) + b' '  # one stray byte after the second message
+        for m in test_capnp.Msg.read_multiple_bytes(buffer(data)):
+            pass
diff --git a/tox.ini b/tox.ini
index e41316bab..75af01611 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = py27,py32,py33,py34
+envlist = py27,py34,py35,py36
 
 [testenv]
 deps=