Redesign MemPool to fix memory access alignment bug and allocate all bug

The alignment problem occured only on CPUs not supporting
unaligned access (e.g. some Arm CPUs) when indefinite length
strings were being decoded. It should up as a crash when
running MemPoolTest, AllocAllStringsTest, IndefiniteLengthStringTest
and IndefiniteLengthNestTest.

The fix changes the interface for QCBORDecode_SetUpAllocator()
and simplifies some of the way the string allocators work.

There is also a fix when allocate all strings is turned on.
Decoded items weren't correctly marked as allocated.
diff --git a/Makefile b/Makefile
index cba2c40..18dacc6 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-#  Copyright (c) 2018, Laurence Lundblade.
+#  Copyright (c) 2018-2019, Laurence Lundblade.
 #  All rights reserved.
 #  
 # Redistribution and use in source and binary forms, with or without
@@ -26,7 +26,7 @@
 # OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
 # IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.# IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-CFLAGS=-I inc -I test -Os -Wall -Werror -pedantic-errors -Wextra -Wshadow -Wparentheses -xc -std=c99
+CFLAGS=-I inc -I test -Os -Wcast-align -Wall -Werror -pedantic-errors -Wextra -Wshadow -Wparentheses -xc -std=c99
 
 QCBOR_OBJ=src/UsefulBuf.o src/qcbor_encode.o src/qcbor_decode.o src/ieee754.o 
 
diff --git a/Makefile.gcc8 b/Makefile.gcc8
index 799ff3b..8f46ce7 100644
--- a/Makefile.gcc8
+++ b/Makefile.gcc8
@@ -1,4 +1,4 @@
-#  Copyright (c) 2018, Laurence Lundblade.
+#  Copyright (c) 2018-2019, Laurence Lundblade.
 #  All rights reserved.
 #  
 # Redistribution and use in source and binary forms, with or without
@@ -26,7 +26,7 @@
 # OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
 # IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.# IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-CFLAGS=-I inc -I test -Os -Wall -Werror -pedantic-errors -Wextra -Wshadow -Wparentheses -xc -std=c99 -Werror=maybe-uninitialized
+CFLAGS=-I inc -I test -O3 -Wcast-align -Wall -Werror -pedantic-errors -Wextra -Wshadow -Wparentheses -xc -std=c99 -Werror=maybe-uninitialized
 CC=/usr/local/bin/gcc-8
 
 QCBOR_OBJ=src/UsefulBuf.o src/qcbor_encode.o src/qcbor_decode.o src/ieee754.o 
diff --git a/README.md b/README.md
index 325d9a3..8b22b8a 100644
--- a/README.md
+++ b/README.md
@@ -14,8 +14,8 @@
   representations and decoding to native C representations is supported.
 
 **Small simple memory model** – Malloc is not needed. The encode
-  context is 136 bytes, decode context is 104 bytes and the
-  description of decoded data item is 56 bytes. Stack use is light and
+  context is 136 bytes, decode context is 144 bytes and the
+  description of decoded data item is 48 bytes. Stack use is light and
   there is no recursion. The caller supplies the memory to hold the
   encoded CBOR and encode/decode contexts so caller has full control
   of memory usage making it good for embedded implementations that
@@ -64,6 +64,10 @@
 changes are not planned or expected, particularly in the
 interface. The test coverage is pretty good.
 
+On Feb 18 2019, there was an interface change to QCBORDecode_SetUpAllocator
+for external string allocator set up. The change was part of a
+memory access alignment fix and code simplification.
+
 ## Building
 
 There is a simple makefile for the UNIX style command line binary that
diff --git a/inc/qcbor.h b/inc/qcbor.h
index 3011cc6..a7e3598 100644
--- a/inc/qcbor.h
+++ b/inc/qcbor.h
@@ -43,6 +43,7 @@
 
  when               who             what, where, why
  --------           ----            ---------------------------------------------------
+ 02/16/19           llundblade      Redesign MemPool to fix memory access alignment bug
  12/18/18           llundblade      Move decode malloc optional code to separate repository
  12/13/18           llundblade      Documentatation improvements
  11/29/18           llundblade      Rework to simpler handling of tags and labels.
@@ -163,6 +164,13 @@
 } QCBORDecodeNesting;
 
 
+typedef struct  {
+   // PRIVATE DATA STRUCTURE
+   void *pAllocateCxt;
+   UsefulBuf (* pfAllocator)(void *pAllocateCxt, void *pOldMem, size_t uNewSize);
+} QCORInternalAllocator;
+
+
 /*
  PRIVATE DATA STRUCTURE
 
@@ -170,8 +178,8 @@
  functions form an "object" that does CBOR decoding.
 
  Size approximation (varies with CPU/compiler):
-   64-bit machine: 32 + 1 + 1 + 6 bytes padding + 72 + 16 = 128 bytes
-   32-bit machine: 16 + 1 + 1 + 2 bytes padding + 68 + 8  = 68 bytes
+   64-bit machine: 32 + 1 + 1 + 6 bytes padding + 72 + 16 + 8 + 8 = 144 bytes
+   32-bit machine: 16 + 1 + 1 + 2 bytes padding + 68 +  8 + 8 + 4 = 108 bytes
  */
 struct _QCBORDecodeContext {
    // PRIVATE DATA STRUCTURE
@@ -182,11 +190,16 @@
 
    QCBORDecodeNesting nesting;
 
-   // This is NULL or points to a QCBORStringAllocator. It is void
-   // here because _QCBORDecodeContext is defined early in the
-   // private part of this file and QCBORStringAllocat is defined
-   // later in the public part of this file.
-   void *pStringAllocator;
+   // If a string allocator is configured for indefinite length
+   // strings, it is configured here.
+   QCORInternalAllocator StringAllocator;
+
+   // These are special for the internal MemPool allocator.
+   // They are not used otherwise. We tried packing these
+   // in the MemPool itself, but there are issues
+   // with memory alignment.
+   uint32_t uMemPoolSize;
+   uint32_t uMemPoolFreeOffset;
 
    // This is NULL or points to QCBORTagList.
    // It is type void for the same reason as above.
@@ -758,45 +771,78 @@
 } QCBORItem;
 
 
+
 /**
- This is a set of functions and pointer context (in object-oriented parlance,
- an "object") used to allocate memory for coalescing the segments of an indefinite
- length string into one.
+  \brief The type defining what a string allocator function must do.
 
- The fAllocate function works as an initial allocator and a reallocator to
- expand the string for each new segment. When it is an initial allocator
- pOldMem is NULL.
+ * \param[in] pAllocateCxt  Pointer to context for the particular
+                             allocator implementation What is in the
+                             context is dependent on how a particular
+                             string allocator works. Typically, it
+                             will contain a pointer to the memory pool
+                             and some booking keeping data.
+ \param[in] pOldMem          Points to some memory allocated by the
+                             allocator that is either to be freed or
+                             to be reallocated to be larger. It is
+                             NULL for new allocations and when call as
+                             a destructor to clean up the whole
+                             allocation.
+ \param[in] uNewSize         Size of memory to be allocated or new
+                             size of chunk to be reallocated. Zero for
+                             a new allocation or when called as a
+                             destructor.
 
- The fFree function is called to clean up an individual allocation when an error occurs.
+ \return   Either the allocated buffer is returned, or \c
+           NULLUsefulBufC. \c NULLUsefulBufC is returned on a failed
+           allocation or in the two cases where there is nothing to
+           return.
 
- The fDesctructor function is called when QCBORDecode_Finish is called.
+ This is called in one of four modes:
 
- Any memory allocated with this will be marked by setting uDataAlloc
- or uLabelAlloc in the QCBORItem structure so the caller knows they
- have to free it.
+ Allocate -- \c uNewSize is the amount to allocate. \c pOldMem is \c
+ NULL.
 
- fAllocate is only ever called to increase the single most recent
- allocation made, making implementation of a memory pool very simple.
+ Free -- \c uNewSize is 0. \c pOldMem points to the memory to be
+ freed.  When the decoder calls this, it will always be the most
+ recent block that was either allocated or reallocated.
 
- fFree is also only called on the single most recent allocation.
+ Reallocate -- \c pOldMem is the block to reallocate. \c uNewSize is
+ its new size.  When the decoder calls this, it will always be the
+ most recent block that was either allocated or reallocated.
+
+ Destruct -- \c pOldMem is NULL and \c uNewSize is 0. This is called
+ when the decoding is complete by QCBORDecode_Finish(). Usually the
+ strings allocated by a string allocator are in use after the decoding
+ is completed so this usually will not free those strings. Many string
+ allocators will not need to do anything in this mode.
+
+ The strings allocated by this will have \c uDataAlloc set to true in
+ the \ref QCBORItem when they are returned. The user of the strings
+ will have to free them. How they free them, depends on the string
+ allocator.
+
+ If QCBORDecode_SetMemPool() is called, the internal MemPool will be
+ used. It has it's own internal implementation of this function, so
+ one does not need to be implemented.
+
  */
-typedef struct {
-   void       *pAllocaterContext;
-   UsefulBuf (*fAllocate)(void *pAllocaterContext, void *pOldMem, size_t uNewSize);
-   void      (*fFree)(void *pAllocaterContext, void *pMem);
-   void      (*fDestructor)(void *pAllocaterContext);
-} QCBORStringAllocator;
+typedef UsefulBuf (* QCBORStringAllocate)(void *pAllocateCxt, void *pOldMem, size_t uNewSize);
 
 
 /**
- This only matters if you use a string allocator
- and and set it up with QCBORDecode_SetMemPool(). It is
+ This only matters if you use the built-in string allocator
+ by settig it up with QCBORDecode_SetMemPool(). This is
  the size of the overhead needed needed by
- QCBORDecode_SetMemPool().  If you write your own
+ QCBORDecode_SetMemPool(). The amount of memory
+ available for decoded strings will be the
+ size of the buffer given to QCBORDecode_SetMemPool() less
+ this amount.
+
+ If you write your own
  string allocator or use the separately available malloc
  based string allocator, this size will not apply
  */
-#define QCBOR_DECODE_MIN_MEM_POOL_SIZE 72
+#define QCBOR_DECODE_MIN_MEM_POOL_SIZE 8
 
 
 /**
@@ -1664,11 +1710,14 @@
 /**
  @brief Set up the MemPool string allocator for indefinite length strings.
 
- @param[in] pCtx The decode context.
- @param[in] MemPool The pointer and length of the memory pool.
- @param[in] bAllStrings true means to put even definite length strings in the pool.
+ @param[in] pCtx         The decode context.
+ @param[in] MemPool      The pointer and length of the memory pool.
 
- @return error if the MemPool was less than QCBOR_DECODE_MIN_MEM_POOL_SIZE.
+ @param[in] bAllStrings  If true, all strings, even of definite
+                         length, will be allocated with the string
+                         allocator.
+
+ @return Error if the MemPool was less than \ref QCBOR_DECODE_MIN_MEM_POOL_SIZE.
 
  Indefinite length strings (text and byte) cannot be decoded unless
  there is a string allocator configured. MemPool is a simple built-in
@@ -1677,25 +1726,28 @@
  length for some block of memory that is to be used for string
  allocation. It can come from the stack, heap or other.
 
- The memory pool must be QCBOR_DECODE_MIN_MEM_POOL_SIZE plus space for
- all the strings allocated.  There is no overhead per string allocated
+ The memory pool must be \ref QCBOR_DECODE_MIN_MEM_POOL_SIZE plus
+ space for all the strings allocated.  There is no overhead per string
+ allocated. A conservative way to size this buffer is to make it the
+ same size as the CBOR being decoded plus \ref
+ QCBOR_DECODE_MIN_MEM_POOL_SIZE.
 
  This memory pool is used for all indefinite length strings that are
  text strings or byte strings, including strings used as labels.
 
- The pointers to strings in QCBORItem will point into the memory pool set
- here. They do not need to be individually freed. Just discard the buffer
- when they are no longer needed.
+ The pointers to strings in \ref QCBORItem will point into the memory
+ pool set here. They do not need to be individually freed. Just
+ discard the buffer when they are no longer needed.
 
- If bAllStrings is set, then the size will be the overhead plus the
+ If \c bAllStrings is set, then the size will be the overhead plus the
  space to hold **all** strings, definite and indefinite length, value
  or label. The advantage of this is that after the decode is complete,
  the original memory holding the encoded CBOR does not need to remain
  valid.
 
  If this function is never called because there is no need to support
- indefinite length strings, the MemPool implementation should be
- dead-stripped by the loader and not add to code size.
+ indefinite length strings, the internal MemPool implementation should
+ be dead-stripped by the loader and not add to code size.
  */
 QCBORError QCBORDecode_SetMemPool(QCBORDecodeContext *pCtx, UsefulBuf MemPool, bool bAllStrings);
 
@@ -1703,23 +1755,44 @@
 /**
  @brief Sets up a custom string allocator for indefinite length strings
 
- @param[in] pCtx The decoder context to set up an allocator for
- @param[in] pAllocator The string allocator "object"
+ @param[in] pCtx                 The decoder context to set up an
+                                 allocator for.
+ @param[in] pfAllocateFunction   Pointer to function that will be
+                                 called by QCBOR for allocations and
+                                 frees.
+ @param[in] pAllocateContext     Context passed to \c
+                                 pfAllocateFunction.
+ @param[in] bAllStrings          If true, all strings, even of definite
+                                 length, will be allocated with the
+                                 string allocator.
 
- See QCBORStringAllocator for the requirements of the string allocator.
+ Indefinite length strings (text and byte) cannot be decoded unless
+ there a string allocator is configured. QCBORDecode_SetUpAllocator()
+ allows the caller to configure an external string allocator
+ implementation if the internal string allocator is not suitable. See
+ QCBORDecode_SetMemPool() to configure the internal allocator. Note
+ that the internal allocator is not automatically set up.
 
- Typically, this is used if the simple MemPool allocator isn't desired.
+ The string allocator configured here can be a custom one designed and
+ implemented by the caller.  See \ref QCBORStringAllocate for the
+ requirements for a string allocator implementation.
 
- A malloc based string allocator can be obtained by calling
- QCBOR_DMalloc(). This function is supply separately from qcbor
- to keep qcbor smaller and neater. It is in a separate
- GitHub repository.
+ A malloc-based string external allocator can be obtained by calling
+ \c QCBORDecode_MakeMallocStringAllocator(). It will return a function
+ and pointer that can be given here as \c pAllocatorFunction and \c
+ pAllocatorContext. It uses standard \c malloc() so \c free() must be
+ called onall strings marked by \c uDataAlloc \c == \c 1 or \c
+ uLabelAlloc \c == \c 1 in \ref QCBORItem.
 
- You can also write your own allocator. Create the allocate, free,
- and destroy functions and put pointers to them in a QCBORStringAllocator.
+ Note that an older version of this function took an allocator
+ structure, rather than single function and pointer.  The older
+ version \c QCBORDecode_MakeMallocStringAllocator() also implemented
+ the older interface.
  */
-void QCBORDecode_SetUpAllocator(QCBORDecodeContext *pCtx, const QCBORStringAllocator *pAllocator, bool bAllStrings);
-
+void QCBORDecode_SetUpAllocator(QCBORDecodeContext *pCtx,
+                                QCBORStringAllocate pfAllocateFunction,
+                                void *pAllocateContext,
+                                bool bAllStrings);
 
 /**
  @brief Configure list of caller selected tags to be recognized
diff --git a/src/qcbor_decode.c b/src/qcbor_decode.c
index abd03e3..079e77d 100644
--- a/src/qcbor_decode.c
+++ b/src/qcbor_decode.c
@@ -42,6 +42,8 @@
 
  when               who             what, where, why
  --------           ----            ---------------------------------------------------
+ 02/17/19           llundblade      Fixed: QCBORItem.u{Data|Label}Alloc when bAllStrings set
+ 02/16/19           llundblade      Redesign MemPool to fix memory access alignment bug
  01/10/19           llundblade      Clever type and argument decoder is 250 bytes smaller
  11/9/18            llundblade      Error codes are now enums.
  11/2/18            llundblade      Simplify float decoding and align with preferred
@@ -309,6 +311,39 @@
 
 
 
+/* ===========================================================================
+   QCBORStringAllocate -- STRING ALLOCATOR INVOCATION
+
+   The following four functions are pretty wrappers for invocation of
+   the string allocator supplied by the caller.
+
+ ==============================================================================*/
+
+static inline void StringAllocator_Free(const QCORInternalAllocator *pMe, void *pMem)
+{
+   (pMe->pfAllocator)(pMe->pAllocateCxt, pMem, 0);
+}
+
+// StringAllocator_Reallocate called with pMem NULL is equal to StringAllocator_Allocate()
+static inline UsefulBuf StringAllocator_Reallocate(const QCORInternalAllocator *pMe, void *pMem, size_t uSize)
+{
+   return (pMe->pfAllocator)(pMe->pAllocateCxt, pMem, uSize);
+}
+
+static inline UsefulBuf StringAllocator_Allocate(const QCORInternalAllocator *pMe, size_t uSize)
+{
+   return (pMe->pfAllocator)(pMe->pAllocateCxt, NULL, uSize);
+}
+
+static inline void StringAllocator_Destruct(const QCORInternalAllocator *pMe)
+{
+   if(pMe->pfAllocator) {
+      (pMe->pfAllocator)(pMe->pAllocateCxt, NULL, 0);
+   }
+}
+
+
+
 
 /*
  Public function, see header file
@@ -327,12 +362,20 @@
 /*
  Public function, see header file
  */
-void QCBORDecode_SetUpAllocator(QCBORDecodeContext *pCtx, const QCBORStringAllocator *pAllocator, bool bAllocAll)
+void QCBORDecode_SetUpAllocator(QCBORDecodeContext *pMe,
+                                QCBORStringAllocate pfAllocateFunction,
+                                void *pAllocateContext,
+                                bool bAllStrings)
 {
-   pCtx->pStringAllocator = (void *)pAllocator;
-   pCtx->bStringAllocateAll = bAllocAll;
+   pMe->StringAllocator.pfAllocator   = pfAllocateFunction;
+   pMe->StringAllocator.pAllocateCxt  = pAllocateContext;
+   pMe->bStringAllocateAll            = bAllStrings;
 }
 
+
+/*
+ Public function, see header file
+ */
 void QCBORDecode_SetCallerConfiguredTagList(QCBORDecodeContext *me, const QCBORTagListIn *pTagList)
 {
    me->pCallerConfiguredTagList = pTagList;
@@ -360,14 +403,14 @@
                                               uint8_t *puAdditionalInfo)
 {
    QCBORError nReturn;
-   
+
    // Get the initial byte that every CBOR data item has
    const uint8_t uInitialByte = UsefulInputBuf_GetByte(pUInBuf);
-   
+
    // Break down the initial byte
    const uint8_t uTmpMajorType   = uInitialByte >> 5;
    const uint8_t uAdditionalInfo = uInitialByte & 0x1f;
-   
+
    // Where the number or argument accumulates
    uint64_t uArgument;
 
@@ -391,18 +434,18 @@
       // No more bytes to get
       uArgument = uAdditionalInfo;
    }
-   
+
    if(UsefulInputBuf_GetError(pUInBuf)) {
       nReturn = QCBOR_ERR_HIT_END;
       goto Done;
    }
-   
+
    // All successful if we got here.
    nReturn           = QCBOR_SUCCESS;
    *pnMajorType      = uTmpMajorType;
    *puArgument       = uArgument;
    *puAdditionalInfo = uAdditionalInfo;
-   
+
 Done:
    return nReturn;
 }
@@ -545,7 +588,11 @@
 /*
  Decode text and byte strings. Call the string allocator if asked to.
  */
-inline static QCBORError DecodeBytes(const QCBORStringAllocator *pAlloc, int nMajorType, uint64_t uStrLen, UsefulInputBuf *pUInBuf, QCBORItem *pDecodedItem)
+inline static QCBORError DecodeBytes(const QCORInternalAllocator *pAllocator,
+                                     int nMajorType,
+                                     uint64_t uStrLen,
+                                     UsefulInputBuf *pUInBuf,
+                                     QCBORItem *pDecodedItem)
 {
    // Stack usage: UsefulBuf 2, int/ptr 1  40
    QCBORError nReturn = QCBOR_SUCCESS;
@@ -557,14 +604,15 @@
       goto Done;
    }
 
-   if(pAlloc) {
+   if(pAllocator) {
       // We are asked to use string allocator to make a copy
-      UsefulBuf NewMem = pAlloc->fAllocate(pAlloc->pAllocaterContext, NULL, uStrLen);
+      UsefulBuf NewMem = StringAllocator_Allocate(pAllocator, uStrLen);
       if(UsefulBuf_IsNULL(NewMem)) {
          nReturn = QCBOR_ERR_STRING_ALLOCATE;
          goto Done;
       }
       pDecodedItem->val.string = UsefulBuf_Copy(NewMem, Bytes);
+      pDecodedItem->uDataAlloc = 1;
    } else {
       // Normal case with no string allocator
       pDecodedItem->val.string = Bytes;
@@ -674,7 +722,9 @@
  Errors detected here include: an array that is too long to decode, hit end of buffer unexpectedly,
     a few forms of invalid encoded CBOR
  */
-static QCBORError GetNext_Item(UsefulInputBuf *pUInBuf, QCBORItem *pDecodedItem, const QCBORStringAllocator *pAlloc)
+static QCBORError GetNext_Item(UsefulInputBuf *pUInBuf,
+                               QCBORItem *pDecodedItem,
+                               const QCORInternalAllocator *pAllocator)
 {
    // Stack usage: int/ptr 3 -- 24
    QCBORError nReturn;
@@ -708,7 +758,7 @@
             pDecodedItem->uDataType  = (uMajorType == CBOR_MAJOR_TYPE_BYTE_STRING) ? QCBOR_TYPE_BYTE_STRING : QCBOR_TYPE_TEXT_STRING;
             pDecodedItem->val.string = (UsefulBufC){NULL, SIZE_MAX};
          } else {
-            nReturn = DecodeBytes(pAlloc, uMajorType, uNumber, pUInBuf, pDecodedItem);
+            nReturn = DecodeBytes(pAllocator, uMajorType, uNumber, pUInBuf, pDecodedItem);
          }
          break;
 
@@ -758,10 +808,14 @@
 {
    // Stack usage; int/ptr 2 UsefulBuf 2 QCBORItem  -- 96
    QCBORError nReturn;
-   QCBORStringAllocator *pAlloc = (QCBORStringAllocator *)me->pStringAllocator;
+   const QCORInternalAllocator *pAllocator = me->StringAllocator.pfAllocator ?
+                                                      &(me->StringAllocator) :
+                                                      NULL;
    UsefulBufC FullString = NULLUsefulBufC;
 
-   nReturn = GetNext_Item(&(me->InBuf), pDecodedItem, me->bStringAllocateAll ? pAlloc: NULL);
+   nReturn = GetNext_Item(&(me->InBuf),
+                          pDecodedItem,
+                          me->bStringAllocateAll ? pAllocator: NULL);
    if(nReturn) {
       goto Done;
    }
@@ -781,7 +835,7 @@
    }
 
    // Can't do indefinite length strings without a string allocator
-   if(pAlloc == NULL) {
+   if(pAllocator == NULL) {
       nReturn = QCBOR_ERR_NO_STRING_ALLOCATOR;
       goto Done;
    }
@@ -795,7 +849,7 @@
       // Get item for next chunk
       QCBORItem StringChunkItem;
       // NULL passed to never string alloc chunk of indefinite length strings
-      nReturn = GetNext_Item(&(me->InBuf), &StringChunkItem, NULL);
+      nReturn = GetNext_Item(&(me->InBuf), &StringChunkItem, pAllocator);
       if(nReturn) {
          break;  // Error getting the next chunk
       }
@@ -816,9 +870,12 @@
       }
 
       // Alloc new buffer or expand previously allocated buffer so it can fit
-      UsefulBuf NewMem = (*pAlloc->fAllocate)(pAlloc->pAllocaterContext,
-                                              UNCONST_POINTER(FullString.ptr),
-                                              FullString.len + StringChunkItem.val.string.len);
+      // The first time throurgh FullString.ptr is NULL and this is
+      // equivalent to StringAllocator_Allocate()
+      UsefulBuf NewMem = StringAllocator_Reallocate(pAllocator,
+                                                    UNCONST_POINTER(FullString.ptr),
+                                                    FullString.len + StringChunkItem.val.string.len);
+
       if(UsefulBuf_IsNULL(NewMem)) {
          // Allocation of memory for the string failed
          nReturn = QCBOR_ERR_STRING_ALLOCATE;
@@ -830,9 +887,9 @@
    }
 
 Done:
-   if(pAlloc && nReturn && !UsefulBuf_IsNULLC(FullString)) {
-      // Getting item failed, clean up the allocated memory
-      (pAlloc->fFree)(pAlloc->pAllocaterContext, UNCONST_POINTER(FullString.ptr));
+   if(nReturn != QCBOR_SUCCESS && !UsefulBuf_IsNULLC(FullString)) {
+      // Getting the item failed, clean up the allocated memory
+      StringAllocator_Free(pAllocator, UNCONST_POINTER(FullString.ptr));
    }
 
    return nReturn;
@@ -1158,12 +1215,7 @@
 Done:
    // Call the destructor for the string allocator if there is one.
    // Always called, even if there are errors; always have to clean up
-   if(me->pStringAllocator) {
-      QCBORStringAllocator *pAllocator = (QCBORStringAllocator *)me->pStringAllocator;
-      if(pAllocator->fDestructor) {
-         (pAllocator->fDestructor)(pAllocator->pAllocaterContext);
-      }
-   }
+   StringAllocator_Destruct(&(me->StringAllocator));
 
    return nReturn;
 }
@@ -1197,123 +1249,176 @@
 
 
 
-/*
- This is a very primitive memory allocator. It does not track individual
- allocations, only a high-water mark. A free or reallotcation must be of
- the last chunk allocated.
+/* ===========================================================================
+   MemPool -- BUILT-IN SIMPLE STRING ALLOCATOR
 
- All of this following code will get dead-stripped if QCBORDecode_SetMemPool()
- is not called.
- */
+   This implements a simple sting allocator for indefinite length
+   strings that can be enabled by calling QCBORDecode_SetMemPool(). It
+   implements the function type QCBORStringAllocate and allows easy
+   use of it.
 
-typedef struct {
-   QCBORStringAllocator  StringAllocator;
-   uint8_t              *pStart;  // First byte that can be allocated
-   uint8_t              *pEnd;    // One past the last byte that can be allocated
-   uint8_t              *pFree;   // Where the next free chunk is
-} MemPool;
+   This particular allocator is built-in for convenience. The caller
+   can implement their own.  All of this following code will get
+   dead-stripped if QCBORDecode_SetMemPool() is not called.
+
+   This is a very primitive memory allocator. It does not track
+   individual allocations, only a high-water mark. A free or
+   reallocation must be of the last chunk allocated.
+
+   The size of the pool and offset to free memory are packed into the
+   first 8 bytes of the memory pool so we don't have to keep them in
+   the decode context. Since the address of the pool may not be
+   aligned, they have to be packed and unpacked as if they were
+   serialized data of the wire or such.
+
+   The sizes packed in are uint32_t to be the same on all CPU types
+   and simplify the code.
+   =========================================================================== */
+
+
+static inline int MemPool_Unpack(const void *pMem, uint32_t *puPoolSize, uint32_t *puFreeOffset)
+{
+   // Use of UsefulInputBuf is overkill, but it is convenient.
+   UsefulInputBuf UIB;
+
+   // Just assume the size here. It was checked during SetUp so the assumption is safe.
+   UsefulInputBuf_Init(&UIB, (UsefulBufC){pMem, QCBOR_DECODE_MIN_MEM_POOL_SIZE});
+   *puPoolSize     = UsefulInputBuf_GetUint32(&UIB);
+   *puFreeOffset   = UsefulInputBuf_GetUint32(&UIB);
+   return UsefulInputBuf_GetError(&UIB);
+}
+
+
+static inline int MemPool_Pack(UsefulBuf Pool, uint32_t uFreeOffset)
+{
+   // Use of UsefulOutBuf is overkill, but convenient. The
+   // length check performed here is useful.
+   UsefulOutBuf UOB;
+
+   UsefulOutBuf_Init(&UOB, Pool);
+   UsefulOutBuf_AppendUint32(&UOB, (uint32_t)Pool.len); // size of pool
+   UsefulOutBuf_AppendUint32(&UOB, uFreeOffset); // first free position
+   return UsefulOutBuf_GetError(&UOB);
+}
 
 
 /*
- Internal function for an allocation
+ Internal function for an allocation, reallocation free and destuct.
+
+ Having only one function rather than one each per mode saves space in
+ QCBORDecodeContext.
 
  Code Reviewers: THIS FUNCTION DOES POINTER MATH
  */
-static UsefulBuf MemPool_Alloc(void *ctx, void *pMem, size_t uNewSize)
+static UsefulBuf MemPool_Function(void *pPool, void *pMem, size_t uNewSize)
 {
-   MemPool *me      = (MemPool *)ctx;
-   void    *pReturn = NULL;
+   UsefulBuf ReturnValue = NULLUsefulBuf;
 
-   if(pMem) {
-      // Realloc case
-      // This check will work even if uNewSize is a super-large value like UINT64_MAX
-      if((uNewSize <= (size_t)(me->pEnd - (uint8_t *)pMem)) && ((uint8_t *)pMem >= me->pStart)) {
-         me->pFree = (uint8_t *)pMem + uNewSize;
-         pReturn   = pMem;
+   uint32_t uPoolSize;
+   uint32_t uFreeOffset;
+
+   if(uNewSize > UINT32_MAX) {
+      // This allocator is only good up to 4GB.  This check should
+      // optimize out if sizeof(size_t) == sizeof(uint32_t)
+      goto Done;
+   }
+   const uint32_t uNewSize32 = (uint32_t)uNewSize;
+
+   if(MemPool_Unpack(pPool, &uPoolSize, &uFreeOffset)) {
+      goto Done;
+   }
+
+   if(uNewSize) {
+      if(pMem) {
+         // REALLOCATION MODE
+         // Calculate pointer to the end of the memory pool.  It is
+         // assumed that pPool + uPoolSize won't wrap around by
+         // assuming the caller won't pass a pool buffer in that is
+         // not in legitimate memory space.
+         const void *pPoolEnd = (uint8_t *)pPool + uPoolSize;
+
+         // Check that the pointer for reallocation is in the range of the
+         // pool. This also makes sure that pointer math further down
+         // doesn't wrap under or over.
+         if(pMem >= pPool && pMem < pPoolEnd) {
+            // Offset to start of chunk for reallocation. This won't
+            // wrap under because of check that pMem >= pPool.  Cast
+            // is safe because the pool is always less than UINT32_MAX
+            // because of check in QCBORDecode_SetMemPool().
+            const uint32_t uMemOffset = (uint32_t)((uint8_t *)pMem - (uint8_t *)pPool);
+
+            // Check to see if the allocation will fit. uPoolSize -
+            // uMemOffset will not wrap under because of check that
+            // pMem is in the range of the uPoolSize by check above.
+            if(uNewSize <= uPoolSize - uMemOffset) {
+               ReturnValue.ptr = pMem;
+               ReturnValue.len = uNewSize;
+
+               // Addition won't wrap around over because uNewSize was
+               // checked to be sure it is less than the pool size.
+               uFreeOffset = uMemOffset + uNewSize32;
+            }
+         }
+      } else {
+         // ALLOCATION MODE
+         // uPoolSize - uFreeOffset will not underflow because this
+         // pool implementation makes sure uFreeOffset is always
+         // smaller than uPoolSize through this check here and
+         // reallocation case.
+         if(uNewSize <= uPoolSize - uFreeOffset) {
+            ReturnValue.len = uNewSize;
+            ReturnValue.ptr = (uint8_t *)pPool + uFreeOffset;
+            uFreeOffset    += uNewSize;
+         }
       }
    } else {
-      // New chunk case
-      // This check will work even if uNewSize is a super large value like UINT64_MAX
-      if(uNewSize <= (size_t)(me->pEnd - me->pFree)) {
-         pReturn    = me->pFree;
-         me->pFree += uNewSize;
+      if(pMem) {
+         // FREE MODE
+         // Cast is safe because of limit on pool size in
+         // QCBORDecode_SetMemPool()
+         uFreeOffset = (uint32_t)((uint8_t *)pMem - (uint8_t *)pPool);
+      } else {
+         // DESTRUCT MODE
+         // Nothing to do for this allocator
       }
    }
 
-   return (UsefulBuf){pReturn, uNewSize};
+   UsefulBuf Pool = {pPool, uPoolSize};
+   MemPool_Pack(Pool, uFreeOffset);
+
+Done:
+   return ReturnValue;
 }
 
-/*
- Internal function to free memory
- */
-static void MemPool_Free(void *ctx, void *pOldMem)
-{
-   MemPool *me = (MemPool *)ctx;
-   me->pFree   = pOldMem;
-}
 
 /*
  Public function, see header qcbor.h file
  */
-QCBORError QCBORDecode_SetMemPool(QCBORDecodeContext *me, UsefulBuf Pool, bool bAllStrings)
+QCBORError QCBORDecode_SetMemPool(QCBORDecodeContext *pMe, UsefulBuf Pool, bool bAllStrings)
 {
-   // The idea behind QCBOR_MIN_MEM_POOL_SIZE is
-   // that the caller knows exactly what size to
-   // allocate and that the tests can run conclusively
-   // no matter what size MemPool is
-   // even though it wastes some memory. MemPool
-   // will vary depending on pointer size of the
-   // the machine. QCBOR_MIN_MEM_POOL_SIZE is
-   // set for pointers up to 64-bits. This
-   // wastes about 50 bytes on a 32-bit machine.
-   // This check makes sure things don't go
-   // horribly wrong. It should optimize out
-   // when there is no problem as the sizes are
-   // known at compile time.
-   if(sizeof(MemPool) > QCBOR_DECODE_MIN_MEM_POOL_SIZE) {
-      return QCBOR_ERR_MEM_POOL_INTERNAL;
-   }
-
-   // The first bytes of the Pool passed in are used
-   // as the context (vtable of sorts) for the memory pool
-   // allocator.
-   if(Pool.len < QCBOR_DECODE_MIN_MEM_POOL_SIZE) {
+   // The pool size and free mem offset are packed into the beginning
+   // of the pool memory. This compile time check make sure the
+   // constant in the header is correct.  This check should optimize
+   // down to nothing.
+   if(QCBOR_DECODE_MIN_MEM_POOL_SIZE < 2 * sizeof(uint32_t)) {
       return QCBOR_ERR_BUFFER_TOO_SMALL;
    }
-   MemPool *pMP = (MemPool *)Pool.ptr;
 
-   // Fill in the "vtable"
-   pMP->StringAllocator.fAllocate   = MemPool_Alloc;
-   pMP->StringAllocator.fFree       = MemPool_Free;
-   pMP->StringAllocator.fDestructor = NULL;
+   // The pool size and free offset packed in to the beginning of pool
+   // memory are only 32-bits. This check will optimize out on 32-bit
+   // machines.
+   if(Pool.len > UINT32_MAX) {
+      return QCBOR_ERR_BUFFER_TOO_LARGE;
+   }
 
-   // Set up the pointers to the memory to be allocated
-   pMP->pStart = (uint8_t *)Pool.ptr + QCBOR_DECODE_MIN_MEM_POOL_SIZE;
-   pMP->pFree  = pMP->pStart;
-   pMP->pEnd   = (uint8_t *)Pool.ptr + Pool.len;
+   // This checks that the pool buffer given is big enough.
+   if(MemPool_Pack(Pool, QCBOR_DECODE_MIN_MEM_POOL_SIZE)) {
+      return QCBOR_ERR_BUFFER_TOO_SMALL;
+   }
 
-   // More book keeping of context
-   pMP->StringAllocator.pAllocaterContext = pMP;
-   me->pStringAllocator   = pMP;
-
-   // The flag indicating when to use the allocator
-   me->bStringAllocateAll = bAllStrings;
+   pMe->StringAllocator.pfAllocator    = MemPool_Function;
+   pMe->StringAllocator.pAllocateCxt  = Pool.ptr;
+   pMe->bStringAllocateAll             = bAllStrings;
 
    return QCBOR_SUCCESS;
 }
-
-
-/*
- Extra little hook to make MemPool testing work right
- without adding any code size or overhead to non-test
- uses. This will get dead-stripped for non-test use.
-
- This is not a public function.
- */
-size_t MemPoolTestHook_GetPoolSize(void *ctx)
-{
-   MemPool *me = (MemPool *)ctx;
-
-   return me->pEnd - me->pStart;
-}
-
diff --git a/test/qcbor_decode_tests.c b/test/qcbor_decode_tests.c
index 38005c0..e9d54fb 100644
--- a/test/qcbor_decode_tests.c
+++ b/test/qcbor_decode_tests.c
@@ -2574,7 +2574,7 @@
    }
 
    // ----- Mempool is way too small -----
-   UsefulBuf_MAKE_STACK_UB(MemPoolTooSmall, 20); // 20 is too small no matter what
+   UsefulBuf_MAKE_STACK_UB(MemPoolTooSmall, QCBOR_DECODE_MIN_MEM_POOL_SIZE-1);
 
    QCBORDecode_Init(&DC, IndefLen, QCBOR_DECODE_MODE_NORMAL);
    if(!QCBORDecode_SetMemPool(&DC,  MemPoolTooSmall, false)) {
@@ -2652,7 +2652,7 @@
       return -34;
    }
 
-    return 0;
+   return 0;
 }
 
 
@@ -2704,6 +2704,8 @@
    if(Item1.uLabelType != QCBOR_TYPE_TEXT_STRING ||
       Item1.uDataType != QCBOR_TYPE_INT64 ||
       Item1.val.int64 != 42 ||
+      Item1.uDataAlloc != 0 ||
+      Item1.uLabelAlloc == 0 ||
       UsefulBuf_Compare(Item1.label.string, UsefulBuf_FromSZ("first integer"))) {
       return -4;
    }
@@ -2712,15 +2714,21 @@
    if(Item2.uLabelType != QCBOR_TYPE_TEXT_STRING ||
       UsefulBuf_Compare(Item2.label.string, UsefulBuf_FromSZ("an array of two strings")) ||
       Item2.uDataType != QCBOR_TYPE_ARRAY ||
+      Item2.uDataAlloc != 0 ||
+      Item2.uLabelAlloc == 0 ||
       Item2.val.uCount != 2)
       return -5;
 
    if(Item3.uDataType != QCBOR_TYPE_TEXT_STRING ||
+      Item3.uDataAlloc == 0 ||
+      Item3.uLabelAlloc != 0 ||
       UsefulBuf_Compare(Item3.val.string, UsefulBuf_FromSZ("string1"))) {
       return -6;
    }
 
    if(Item4.uDataType != QCBOR_TYPE_TEXT_STRING ||
+      Item4.uDataAlloc == 0 ||
+      Item4.uLabelAlloc != 0 ||
       UsefulBuf_Compare(Item4.val.string, UsefulBuf_FromSZ("string2"))) {
       return -7;
    }
@@ -2749,88 +2757,136 @@
    return 0;
 }
 
-// Cheating declaration to get to the special test hook
-size_t MemPoolTestHook_GetPoolSize(void *ctx);
 
 
 int MemPoolTest(void)
 {
-   // Set up the decoder with a tiny bit of CBOR to parse
+   // Set up the decoder with a tiny bit of CBOR to parse because
+   // nothing can be done with it unless that is set up.
    QCBORDecodeContext DC;
    const uint8_t pMinimalCBOR[] = {0xa0}; // One empty map
    QCBORDecode_Init(&DC, UsefulBuf_FROM_BYTE_ARRAY_LITERAL(pMinimalCBOR),0);
 
    // Set up an memory pool of 100 bytes
+   // Then fish into the internals of the decode context
+   // to get the allocator function so it can be called directly.
+   // Also figure out how much pool is available for use
+   // buy subtracting out the overhead.
    UsefulBuf_MAKE_STACK_UB(Pool, 100);
    QCBORError nError = QCBORDecode_SetMemPool(&DC, Pool, 0);
    if(nError) {
       return -9;
    }
+   QCBORStringAllocate pAlloc = DC.StringAllocator.pfAllocator;
+   void *pAllocCtx            = DC.StringAllocator.pAllocateCxt;
+   size_t uAvailPool = Pool.len - QCBOR_DECODE_MIN_MEM_POOL_SIZE;
 
-   // Cheat a little to get to the string allocator object
-   // so we can call it directly to test it
-   QCBORStringAllocator *pAlloc = (QCBORStringAllocator *)DC.pStringAllocator;
-   // Cheat some more to know exactly the
-   size_t uAvailPool = MemPoolTestHook_GetPoolSize(pAlloc);
-
-   // First test -- ask for too much in one go
-   UsefulBuf Allocated = (*pAlloc->fAllocate)(pAlloc->pAllocaterContext, NULL, uAvailPool+1);
+   // First test -- ask for one more byte than available and see failure
+   UsefulBuf Allocated = (*pAlloc)(pAllocCtx, NULL, uAvailPool+1);
    if(!UsefulBuf_IsNULL(Allocated)) {
       return -1;
    }
 
-
    // Re do the set up for the next test that will do a successful alloc,
    // a fail, a free and then success
-   // This test should work on 32 and 64-bit machines if the compiler
-   // does the expected thing with pointer sizes for the internal
-   // MemPool implementation leaving 44 or 72 bytes of pool memory.
    QCBORDecode_SetMemPool(&DC, Pool, 0);
+   pAlloc    = DC.StringAllocator.pfAllocator;
+   pAllocCtx = DC.StringAllocator.pAllocateCxt;
+   uAvailPool = Pool.len - QCBOR_DECODE_MIN_MEM_POOL_SIZE;
 
-   // Cheat a little to get to the string allocator object
-   // so we can call it directly to test it
-   pAlloc = (QCBORStringAllocator *)DC.pStringAllocator;
-   // Cheat some more to know exactly the
-   uAvailPool = MemPoolTestHook_GetPoolSize(pAlloc);
-
-   Allocated = (*pAlloc->fAllocate)(pAlloc->pAllocaterContext, NULL, uAvailPool-1);
+   // Allocate one byte less than available and see success
+   Allocated = (pAlloc)(pAllocCtx, NULL, uAvailPool-1);
    if(UsefulBuf_IsNULL(Allocated)) { // expected to succeed
       return -2;
    }
-   UsefulBuf Allocated2 = (*pAlloc->fAllocate)(pAlloc->pAllocaterContext, NULL, uAvailPool/2);
+   // Ask for some more and see failure
+   UsefulBuf Allocated2 = (*pAlloc)(pAllocCtx, NULL, uAvailPool/2);
    if(!UsefulBuf_IsNULL(Allocated2)) { // expected to fail
       return -3;
    }
-   (*pAlloc->fFree)(pAlloc->pAllocaterContext, Allocated.ptr);
-   Allocated = (*pAlloc->fAllocate)(pAlloc->pAllocaterContext, NULL, uAvailPool/2);
+   // Free the first allocate, retry the second and see success
+   (*pAlloc)(pAllocCtx, Allocated.ptr, 0); // Free
+   Allocated = (*pAlloc)(pAllocCtx, NULL, uAvailPool/2);
    if(UsefulBuf_IsNULL(Allocated)) { // succeed because of the free
       return -4;
    }
 
-
    // Re do set up for next test that involves a successful alloc,
    // and a successful realloc and a failed realloc
    QCBORDecode_SetMemPool(&DC, Pool, 0);
+   pAlloc    = DC.StringAllocator.pfAllocator;
+   pAllocCtx = DC.StringAllocator.pAllocateCxt;
 
-   // Cheat a little to get to the string allocator object
-   // so we can call it directly to test it
-   pAlloc = (QCBORStringAllocator *)DC.pStringAllocator;
-   Allocated = (*pAlloc->fAllocate)(pAlloc->pAllocaterContext, NULL, uAvailPool/2);
+   // Allocate half the pool and see success
+   Allocated = (*pAlloc)(pAllocCtx, NULL, uAvailPool/2);
    if(UsefulBuf_IsNULL(Allocated)) { // expected to succeed
       return -5;
    }
-   Allocated2 = (*pAlloc->fAllocate)(pAlloc->pAllocaterContext, Allocated.ptr, uAvailPool);
+   // Reallocate to take up the whole pool and see success
+   Allocated2 = (*pAlloc)(pAllocCtx, Allocated.ptr, uAvailPool);
    if(UsefulBuf_IsNULL(Allocated2)) {
       return -6;
    }
+   // Make sure its the same pointer and the size is right
    if(Allocated2.ptr != Allocated.ptr || Allocated2.len != uAvailPool) {
       return -7;
    }
-   UsefulBuf Allocated3 = (*pAlloc->fAllocate)(pAlloc->pAllocaterContext, Allocated.ptr, uAvailPool+1);
-   if(!UsefulBuf_IsNULL(Allocated3)) { // expected to fail
+   // Try to allocate more to be sure there is failure after a realloc
+   UsefulBuf Allocated3 = (*pAlloc)(pAllocCtx, Allocated.ptr, uAvailPool+1);
+   if(!UsefulBuf_IsNULL(Allocated3)) {
       return -8;
    }
 
    return 0;
 }
 
+
+/* Just enough of an allocator to test configuration of one */
+static UsefulBuf AllocateTestFunction(void *pCtx, void *pOldMem, size_t uNewSize)
+{
+   (void)pOldMem; // unused variable
+
+   if(uNewSize) {
+      // Assumes the context pointer is the buffer and
+      // nothing too big will ever be asked for.
+      // This is only good for this basic test!
+      return (UsefulBuf) {pCtx, uNewSize};
+   } else {
+      return NULLUsefulBuf;
+   }
+}
+
+
+int SetUpAllocatorTest(void)
+{
+   // Set up the decoder with a tiny bit of CBOR to parse because
+   // nothing can be done with it unless that is set up.
+   QCBORDecodeContext DC;
+   const uint8_t pMinimalCBOR[] = {0x62, 0x48, 0x69}; // "Hi"
+   QCBORDecode_Init(&DC, UsefulBuf_FROM_BYTE_ARRAY_LITERAL(pMinimalCBOR),0);
+
+   uint8_t pAllocatorBuffer[50];
+
+   // This is really just to test that this call works.
+   // The full functionality of string allocators is tested
+   // elsewhere with the MemPool internal allocator.
+   QCBORDecode_SetUpAllocator(&DC, AllocateTestFunction, pAllocatorBuffer, 1);
+
+   QCBORItem Item;
+   if(QCBORDecode_GetNext(&DC, &Item) != QCBOR_SUCCESS) {
+      return -1;
+   }
+
+   if(Item.uDataAlloc == 0 ||
+      Item.uDataType != QCBOR_TYPE_TEXT_STRING ||
+      Item.val.string.ptr != pAllocatorBuffer) {
+      return -2;
+   }
+
+   if(QCBORDecode_Finish(&DC) != QCBOR_SUCCESS) {
+      return -3;
+   }
+
+   return 0;
+}
+
diff --git a/test/qcbor_decode_tests.h b/test/qcbor_decode_tests.h
index a7ee0b5..b76b963 100644
--- a/test/qcbor_decode_tests.h
+++ b/test/qcbor_decode_tests.h
@@ -226,4 +226,12 @@
 int MemPoolTest(void);
 
 
+/*
+ Test the setting up of an external string allocator.
+ */
+int SetUpAllocatorTest(void);
+
+
+
+
 #endif /* defined(__QCBOR__qcbort_decode_tests__) */
diff --git a/test/run_tests.c b/test/run_tests.c
index c2816ba..6894a5f 100644
--- a/test/run_tests.c
+++ b/test/run_tests.c
@@ -164,6 +164,7 @@
     TEST_ENTRY(StringDecoderModeFailTest),
     TEST_ENTRY_DISABLED(BigComprehensiveInputTest),
     TEST_ENTRY(EncodeErrorTests),
+    TEST_ENTRY(SetUpAllocatorTest),
     //TEST_ENTRY(fail_test),
 };
 
@@ -301,7 +302,6 @@
    PrintSize("sizeof(QCBORDecodeNesting)",  (uint32_t)sizeof(QCBORDecodeNesting), pfOutput, pOutCtx);
    PrintSize("sizeof(QCBORDecodeContext)",  (uint32_t)sizeof(QCBORDecodeContext), pfOutput, pOutCtx);
    PrintSize("sizeof(QCBORItem)",           (uint32_t)sizeof(QCBORItem), pfOutput, pOutCtx);
-   PrintSize("sizeof(QCBORStringAllocator)",(uint32_t)sizeof(QCBORStringAllocator), pfOutput, pOutCtx);
    PrintSize("sizeof(QCBORTagListIn)",      (uint32_t)sizeof(QCBORTagListIn), pfOutput, pOutCtx);
    PrintSize("sizeof(QCBORTagListOut)",     (uint32_t)sizeof(QCBORTagListOut), pfOutput, pOutCtx);
    (*pfOutput)("", pOutCtx, 1);