From 79b80f84df7de0618596b293566062a3ea460958 Mon Sep 17 00:00:00 2001
From: Wouter van Oortmerssen <aardappel@gmail.com>
Date: Thu, 28 Dec 2017 14:38:04 -0800
Subject: [PATCH] Reduced FlatBufferBuilder from 3 buffers to 1

Previously, FlatBufferBuilder used 3 resizable buffers:
- serialization (vector_downward)
- field offsets (std::vector)
- vtable offsets (std::vector)

Since the serialization buffer grows downwards, its unused bottom part
can double as "scratchpad" storage for the other two. Field offsets are
accumulated only while a table is under construction, and vtable offsets
only once it is finished, so the two can trivially share the same
storage.
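
As a rough illustration (a hypothetical DualBuffer with an illustrative
growth policy, not the actual vector_downward API), the scheme amounts
to two stacks in one allocation: serialized bytes are pushed downward
from the top, scratch data upward from the bottom, and a reallocation
copies each in-use end to the matching end of the new block:

  #include <cstdint>
  #include <cstdlib>
  #include <cstring>

  struct DualBuffer {
    uint8_t *buf;      // bottom of the allocation
    size_t reserved;   // total bytes allocated
    uint8_t *cur;      // serialized data lives in [cur, buf + reserved)
    uint8_t *scratch;  // scratch data lives in [buf, scratch)

    explicit DualBuffer(size_t initial)
        : buf(static_cast<uint8_t *>(malloc(initial))),
          reserved(initial),
          cur(buf + initial),
          scratch(buf) {}
    ~DualBuffer() { free(buf); }

    size_t size() const { return reserved - (cur - buf); }
    size_t scratch_size() const { return scratch - buf; }

    // Grow when the two ends would collide. Each in-use end is copied
    // to the corresponding end of the new block, leaving the free space
    // in the middle (cf. reallocate_downward's in_use_back/in_use_front).
    void ensure_space(size_t len) {
      if (len <= static_cast<size_t>(cur - scratch)) return;
      size_t old_reserved = reserved;
      size_t in_use_back = size();           // serialized bytes (top)
      size_t in_use_front = scratch_size();  // scratch bytes (bottom)
      reserved = old_reserved * 2 + len;     // illustrative growth policy
      uint8_t *new_buf = static_cast<uint8_t *>(malloc(reserved));
      memcpy(new_buf + reserved - in_use_back,
             buf + old_reserved - in_use_back, in_use_back);
      memcpy(new_buf, buf, in_use_front);
      free(buf);
      buf = new_buf;
      cur = buf + reserved - in_use_back;
      scratch = buf + in_use_front;
    }

    void push(const void *bytes, size_t len) {  // serialize, grows down
      ensure_space(len);
      cur -= len;
      memcpy(cur, bytes, len);
    }
    void scratch_push(const void *bytes, size_t len) {  // grows up
      ensure_space(len);
      memcpy(scratch, bytes, len);
      scratch += len;
    }
    void scratch_pop(size_t len) { scratch -= len; }  // discard temp data
  };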

Not only does this reduce the number of allocations, it also removes
the bulk of the std::vector usage from FlatBufferBuilder, which was
the #1 cause of slowdown in debug mode; see e.g.:
https://stackoverflow.com/questions/36085285/any-way-to-improve-flatbuffer-performance-in-debug-c-msvc

Change-Id: I0224cf2f2a863d2d7ef762bc9163b52fdc149522
Tested: on Linux.
---
 include/flatbuffers/flatbuffers.h | 112 ++++++++++++++++++++----------
 1 file changed, 74 insertions(+), 38 deletions(-)

diff --git a/include/flatbuffers/flatbuffers.h b/include/flatbuffers/flatbuffers.h
index cdf2c8cf..c1ed1d19 100644
--- a/include/flatbuffers/flatbuffers.h
+++ b/include/flatbuffers/flatbuffers.h
@@ -363,11 +363,16 @@ class Allocator {
   // Reallocate `new_size` bytes of memory, replacing the old region of size
   // `old_size` at `p`. In contrast to a normal realloc, this grows downwards,
   // and is intended specifically for `vector_downward` use.
+  // `in_use_back` and `in_use_front` indicate how much of `old_size` is
+  // actually in use at each end, and needs to be copied.
   virtual uint8_t *reallocate_downward(uint8_t *old_p, size_t old_size,
-                                       size_t new_size) {
+                                       size_t new_size, size_t in_use_back,
+                                       size_t in_use_front) {
     assert(new_size > old_size);  // vector_downward only grows
     uint8_t *new_p = allocate(new_size);
-    memcpy(new_p + (new_size - old_size), old_p, old_size);
+    memcpy(new_p + new_size - in_use_back, old_p + old_size - in_use_back,
+           in_use_back);
+    memcpy(new_p, old_p, in_use_front);
     deallocate(old_p, old_size);
     return new_p;
   }
@@ -503,6 +508,9 @@ class DetachedBuffer {
 // This is a minimal replication of std::vector<uint8_t> functionality,
 // except growing from higher to lower addresses, i.e. push_back() inserts
 // data at the lowest address in the vector.
+// Since this vector leaves the lower part unused, a "scratch-pad" for
+// temporary data can be stored there, sharing the allocated space.
+// Essentially, this supports two std::vectors in a single buffer.
 class vector_downward {
  public:
   explicit vector_downward(size_t initial_size = 1024,
@@ -513,7 +521,8 @@ class vector_downward {
         initial_size_(initial_size),
         reserved_(0),
         buf_(nullptr),
-        cur_(nullptr) {
+        cur_(nullptr),
+        scratch_(nullptr) {
     assert(allocator_);
   }
 
@@ -533,15 +542,18 @@ class vector_downward {
     reserved_ = 0;
     buf_ = nullptr;
     cur_ = nullptr;
+    scratch_ = nullptr;
   }
 
   void clear() {
     if (buf_) {
       cur_ = buf_ + reserved_;
+      scratch_ = buf_;
     } else {
       reserved_ = 0;
       buf_ = nullptr;
       cur_ = nullptr;
+      scratch_ = nullptr;
     }
   }
 
@@ -554,6 +566,7 @@ class vector_downward {
     reserved_ = 0;
     buf_ = nullptr;
     cur_ = nullptr;
+    scratch_ = nullptr;
     return fb;
   }
 
@@ -562,13 +575,17 @@ class vector_downward {
                         : ((bytes / 2) & ~(AlignOf<largest_scalar_t>() - 1));
   }
 
-  uint8_t *make_space(size_t len) {
-    assert(cur_ >= buf_);
-    if (len > static_cast<size_t>(cur_ - buf_)) { reallocate(len); }
-    cur_ -= len;
+  size_t ensure_space(size_t len) {
+    assert(cur_ >= scratch_ && scratch_ >= buf_);
+    if (len > static_cast<size_t>(cur_ - scratch_)) { reallocate(len); }
     // Beyond this, signed offsets may not have enough range:
     // (FlatBuffers > 2GB not supported).
     assert(size() < FLATBUFFERS_MAX_BUFFER_SIZE);
+    return len;
+  }
+
+  inline uint8_t *make_space(size_t len) {
+    cur_ -= ensure_space(len);
     return cur_;
   }
 
@@ -578,45 +595,59 @@ class vector_downward {
     return static_cast<uoffset_t>(reserved_ - (cur_ - buf_));
   }
 
+  uoffset_t scratch_size() const {
+    return static_cast<uoffset_t>(scratch_ - buf_);
+  }
+
   size_t capacity() const { return reserved_; }
 
-  uint8_t *buf() const {
+  uint8_t *data() const {
+    assert(cur_);
+    return cur_;
+  }
+
+  uint8_t *scratch_data() const {
     assert(buf_);
     return buf_;
   }
 
-  uint8_t *data() const {
-    assert(cur_);
-    return cur_;
+  uint8_t *scratch_end() const {
+    assert(scratch_);
+    return scratch_;
   }
 
   uint8_t *data_at(size_t offset) const { return buf_ + reserved_ - offset; }
 
   void push(const uint8_t *bytes, size_t num) {
-    auto dest = make_space(num);
-    memcpy(dest, bytes, num);
+    memcpy(make_space(num), bytes, num);
   }
 
   // Specialized version of push() that avoids a memcpy call for small data.
   template<typename T> void push_small(const T &little_endian_t) {
-    auto dest = make_space(sizeof(T));
-    *reinterpret_cast<T *>(dest) = little_endian_t;
+    make_space(sizeof(T));
+    *reinterpret_cast<T *>(cur_) = little_endian_t;
+  }
+
+  template<typename T> void scratch_push_small(const T &t) {
+    ensure_space(sizeof(T));
+    *reinterpret_cast<T *>(scratch_) = t;
+    scratch_ += sizeof(T);
   }
 
   // fill() is most frequently called with small byte counts (<= 4),
   // which is why we're using loops rather than calling memset.
   void fill(size_t zero_pad_bytes) {
-    auto dest = make_space(zero_pad_bytes);
-    for (size_t i = 0; i < zero_pad_bytes; i++) dest[i] = 0;
+    make_space(zero_pad_bytes);
+    for (size_t i = 0; i < zero_pad_bytes; i++) cur_[i] = 0;
   }
 
   // Version for when we know the size is larger.
   void fill_big(size_t zero_pad_bytes) {
-    auto dest = make_space(zero_pad_bytes);
-    memset(dest, 0, zero_pad_bytes);
+    memset(make_space(zero_pad_bytes), 0, zero_pad_bytes);
   }
 
   void pop(size_t bytes_to_remove) { cur_ += bytes_to_remove; }
+  void scratch_pop(size_t bytes_to_remove) { scratch_ -= bytes_to_remove; }
 
  private:
   // You shouldn't really be copying instances of this class.
@@ -629,20 +660,24 @@ class vector_downward {
   size_t reserved_;
   uint8_t *buf_;
   uint8_t *cur_;  // Points at location between empty (below) and used (above).
+  uint8_t *scratch_;  // Points to the end of the scratchpad in use.
 
   void reallocate(size_t len) {
     assert(allocator_);
     auto old_reserved = reserved_;
     auto old_size = size();
+    auto old_scratch_size = scratch_size();
     reserved_ += (std::max)(len, growth_policy(old_reserved));
     FLATBUFFERS_CONSTEXPR size_t alignment = AlignOf<largest_scalar_t>();
     reserved_ = (reserved_ + alignment - 1) & ~(alignment - 1);
     if (buf_) {
-      buf_ = allocator_->reallocate_downward(buf_, old_reserved, reserved_);
+      buf_ = allocator_->reallocate_downward(buf_, old_reserved, reserved_,
+                                             old_size, old_scratch_size);
     } else {
       buf_ = allocator_->allocate(reserved_);
     }
     cur_ = buf_ + reserved_ - old_size;
+    scratch_ = buf_ + old_scratch_size;
   }
 };
 
@@ -685,6 +720,7 @@ class FlatBufferBuilder {
                              Allocator *allocator = nullptr,
                              bool own_allocator = false)
       : buf_(initial_size, allocator, own_allocator),
+        num_field_loc(0),
         max_voffset_(0),
         nested(false),
         finished(false),
@@ -692,8 +728,6 @@ class FlatBufferBuilder {
         force_defaults_(false),
         dedup_vtables_(true),
         string_pool(nullptr) {
-    offsetbuf_.reserve(16);  // Avoid first few reallocs.
-    vtables_.reserve(16);
     EndianCheck();
   }
 
@@ -709,11 +743,10 @@ class FlatBufferBuilder {
   /// @brief Reset all the state in this FlatBufferBuilder so it can be reused
   /// to construct another buffer.
   void Clear() {
-    buf_.clear();
     ClearOffsets();
+    buf_.clear();
     nested = false;
     finished = false;
-    vtables_.clear();
     minalign_ = 1;
     if (string_pool) string_pool->clear();
   }
@@ -821,7 +854,8 @@ class FlatBufferBuilder {
   // vtables later.
   void TrackField(voffset_t field, uoffset_t off) {
     FieldLoc fl = { off, field };
-    offsetbuf_.push_back(fl);
+    buf_.scratch_push_small(fl);
+    num_field_loc++;
     max_voffset_ = (std::max)(max_voffset_, field);
   }
 
@@ -870,6 +904,8 @@ class FlatBufferBuilder {
     // it is here is that storing objects in-line may cause vtable offsets
     // to not fit anymore. It also leads to vtable duplication.
     assert(!nested);
+    // If you hit this, fields were added outside the scope of a table.
+    assert(!num_field_loc);
   }
 
   // From generated code (or from the parser), we call StartTable/EndTable
@@ -904,8 +940,9 @@ class FlatBufferBuilder {
                            static_cast<voffset_t>(table_object_size));
     WriteScalar<voffset_t>(buf_.data(), max_voffset_);
     // Write the offsets into the table
-    for (auto field_location = offsetbuf_.begin();
-         field_location != offsetbuf_.end(); ++field_location) {
+    for (auto it = buf_.scratch_end() - num_field_loc * sizeof(FieldLoc);
+         it < buf_.scratch_end(); it += sizeof(FieldLoc)) {
+      auto field_location = reinterpret_cast<FieldLoc *>(it);
       auto pos = static_cast<voffset_t>(vtableoffsetloc - field_location->off);
       // If this asserts, it means you've set a field twice.
       assert(!ReadScalar<voffset_t>(buf_.data() + field_location->id));
@@ -918,17 +955,19 @@ class FlatBufferBuilder {
     // See if we already have generated a vtable with this exact same
     // layout before. If so, make it point to the old one, remove this one.
     if (dedup_vtables_) {
-      for (auto it = vtables_.begin(); it != vtables_.end(); ++it) {
-        auto vt2 = reinterpret_cast<voffset_t *>(buf_.data_at(*it));
+      for (auto it = buf_.scratch_data(); it < buf_.scratch_end();
+           it += sizeof(uoffset_t)) {
+        auto vt_offset_ptr = reinterpret_cast<uoffset_t *>(it);
+        auto vt2 = reinterpret_cast<voffset_t *>(buf_.data_at(*vt_offset_ptr));
         auto vt2_size = *vt2;
         if (vt1_size != vt2_size || memcmp(vt2, vt1, vt1_size)) continue;
-        vt_use = *it;
+        vt_use = *vt_offset_ptr;
         buf_.pop(GetSize() - vtableoffsetloc);
         break;
       }
     }
     // If this is a new vtable, remember it.
-    if (vt_use == GetSize()) { vtables_.push_back(vt_use); }
+    if (vt_use == GetSize()) { buf_.scratch_push_small(vt_use); }
     // Fill the vtable offset we created above.
     // The offset points from the beginning of the object to where the
     // vtable is stored.
@@ -966,7 +1005,8 @@ class FlatBufferBuilder {
   uoffset_t EndStruct() { return GetSize(); }
 
   void ClearOffsets() {
-    offsetbuf_.clear();
+    buf_.scratch_pop(num_field_loc * sizeof(FieldLoc));
+    num_field_loc = 0;
     max_voffset_ = 0;
   }
 
@@ -1105,9 +1145,6 @@ class FlatBufferBuilder {
     PreAlign(len * elemsize, alignment);
   }
 
-  uint8_t *ReserveElements(size_t len, size_t elemsize) {
-    return buf_.make_space(len * elemsize);
-  }
   /// @endcond
 
   /// @brief Serialize an array into a FlatBuffer `vector`.
@@ -1519,7 +1556,8 @@ class FlatBufferBuilder {
   vector_downward buf_;
 
   // Accumulating offsets of table members while a table is being built.
-  std::vector<FieldLoc> offsetbuf_;
+  // We store these in the scratch pad of buf_, after the vtable offsets.
+  uoffset_t num_field_loc;
   // Track how much of the vtable is in use, so we can output the most compact
   // possible vtable.
   voffset_t max_voffset_;
@@ -1530,8 +1568,6 @@ class FlatBufferBuilder {
   // Ensure the buffer is finished before it is being accessed.
   bool finished;
 
-  std::vector<uoffset_t> vtables_;  // todo: Could make this into a map?
-
   size_t minalign_;
 
   bool force_defaults_;  // Serialize values equal to their defaults anyway.
-- 
GitLab