20 files changed, 203 insertions, 140 deletions
diff --git a/Theron b/Theron
-Subproject e31d0a51e9ea245e7a981365fb58023d24e8649
+Subproject 1a87688a68c58305947ec2d3eb83e51d8724655
diff --git a/benchmark/bm_registry.py b/benchmark/bm_registry.py
index 7c9bb1c..da7efe4 100644
--- a/benchmark/bm_registry.py
+++ b/benchmark/bm_registry.py
@@ -6,7 +6,7 @@ def tokwargs(**kwargs): return kwargs
 
 INSTALL_PREFIX = '/opt/usr'
 
-DURATION = '1.7' #seconds
+DURATION = '1.3' #seconds
 
 GRAS_ENV = {
     'PATH': os.path.join(INSTALL_PREFIX, 'gras/bin:$PATH:%s'%os.getenv('PATH')),
@@ -20,15 +20,27 @@ GR_ENV = {
     'PYTHONPATH': os.path.join(INSTALL_PREFIX, 'gr/lib/python2.7/dist-packages:%s'%os.getenv('PYTHONPATH')),
 }
 
-BENCHMARK_MANY_11_BLOCKS = tokwargs(
-    wat='Benchmark the schedulers with many 1:1 ratio blocks',
+BENCHMARK_LINEAR_CHAIN = tokwargs(
+    wat='Benchmark the schedulers with linear chain topology',
     moar='''\
-- Compare simultaneous 1:1 ratio blocks in each scheduler.
+- Topology is a linear chain of one input/one output blocks.
 - GRAS will use only the buffer pool allocator,
 and every work will fully consume available buffers.''',
     tests = [
-        tokwargs(wat='GRAS',     args=['tb_many_1_to_1_blocks.py', DURATION], env=GRAS_ENV),
-        tokwargs(wat='GR',       args=['tb_many_1_to_1_blocks.py', DURATION], env=GR_ENV),
+        tokwargs(wat='GRAS',     args=['tb_linear_chain.py', DURATION], env=GRAS_ENV, expand=True),
+        tokwargs(wat='GRSS',     args=['tb_linear_chain.py', DURATION], env=GR_ENV),
+    ],
+)
+
+BENCHMARK_COMBINER_ARRAY = tokwargs(
+    wat='Benchmark the schedulers with combiner array topology',
+    moar='''\
+- Topology is a tower of two input math blocks.
+- GRAS will use only the buffer pool allocator,
+and every work will fully consume available buffers.''',
+    tests = [
+        tokwargs(wat='GRAS',     args=['tb_combiner_array.py', DURATION], env=GRAS_ENV, expand=True),
+        tokwargs(wat='GRSS',     args=['tb_combiner_array.py', DURATION], env=GR_ENV),
     ],
 )
 
@@ -39,40 +51,58 @@ BENCHMARK_MANY_RATE_BLOCKS = tokwargs(
 - GRAS will use only the buffer pool allocator,
 and every work will fully consume available buffers.''',
     tests = [
-        tokwargs(wat='GRAS',     args=['tb_many_rate_changes.py', '--dur', DURATION], env=GRAS_ENV),
-        tokwargs(wat='GR',       args=['tb_many_rate_changes.py', '--dur', DURATION], env=GR_ENV),
+        tokwargs(wat='GRAS',     args=['tb_many_rate_changes.py', '--dur', DURATION], env=GRAS_ENV, expand=True),
+        tokwargs(wat='GRSS',     args=['tb_many_rate_changes.py', '--dur', DURATION], env=GR_ENV),
+    ],
+)
+
+BENCHMARK_DFIR_BLOCK = tokwargs(
+    wat='Benchmark the schedulers with a FIR block',
+    moar='''\
+- Compare filter blocks in each scheduler.
+- Shows both schedulers using circular buffer.''',
+    tests = [
+        tokwargs(wat='GRAS',     args=['tb_filter_block.py', '--dur', DURATION, '--which', 'dfir'], env=GRAS_ENV, expand=True),
+        tokwargs(wat='GRSS',     args=['tb_filter_block.py', '--dur', DURATION, '--which', 'dfir'], env=GR_ENV),
     ],
 )
 
-BENCHMARK_FILTER_BLOCK = tokwargs(
-    wat='Benchmark the schedulers with a filter block',
+BENCHMARK_RESAMP_BLOCK = tokwargs(
+    wat='Benchmark the schedulers with a resampler block',
     moar='''\
 - Compare filter blocks in each scheduler.
-- Shows both schedulers using circular buffer.
-- The decimating FIR filter is compared.
-- The rational resampler filter is compared.''',
+- Shows both schedulers using circular buffer.''',
+    tests = [
+        tokwargs(wat='GRAS',     args=['tb_filter_block.py', '--dur', DURATION, '--which', 'resamp'], env=GRAS_ENV, expand=True),
+        tokwargs(wat='GRSS',     args=['tb_filter_block.py', '--dur', DURATION, '--which', 'resamp'], env=GR_ENV),
+    ],
+)
+
+BENCHMARK_ADD_OPS = tokwargs(
+    wat='Benchmark GrExtras vs gr-blocks adder blocks',
+    moar='''\
+- Compare math block implementations using GRAS.
+- All blocks are using vector optimization.
+- GrExtras math blocks avoid an unnecessary memcpy.
+- GrExtras math blocks enable automatic bufer in-placing.''',
     tests = [
-        tokwargs(wat='GRAS decim FIR',     args=['tb_filter_block.py', '--dur', DURATION, '--which', 'dfir'], env=GRAS_ENV),
-        tokwargs(wat='GR decim FIR',       args=['tb_filter_block.py', '--dur', DURATION, '--which', 'dfir'], env=GR_ENV),
-        tokwargs(wat='GRAS resampler',     args=['tb_filter_block.py', '--dur', DURATION, '--which', 'resamp'], env=GRAS_ENV),
-        tokwargs(wat='GR resampler',       args=['tb_filter_block.py', '--dur', DURATION, '--which', 'resamp'], env=GR_ENV),
+        tokwargs(wat='GrExtras\n(GRAS)',        args=['tb_grextras_math.py', DURATION, 'extras_add'], env=GRAS_ENV, expand=True),
+        tokwargs(wat='gr-blocks\n(GRAS)',       args=['tb_grextras_math.py', DURATION, 'blocks_add'], env=GRAS_ENV, expand=True),
+        tokwargs(wat='gr-blocks\n(GRSS)',       args=['tb_grextras_math.py', DURATION, 'blocks_add'], env=GR_ENV),
     ],
 )
 
-BENCHMARK_MATH_OPS = tokwargs(
-    wat='Benchmark GrExtras vs gr-blocks math blocks',
+BENCHMARK_MULT_OPS = tokwargs(
+    wat='Benchmark GrExtras vs gr-blocks multiplier blocks',
     moar='''\
 - Compare math block implementations using GRAS.
 - All blocks are using vector optimization.
 - GrExtras math blocks avoid an unnecessary memcpy.
 - GrExtras math blocks enable automatic bufer in-placing.''',
     tests = [
-        tokwargs(wat='GrExtras Add\n(GRAS)',        args=['tb_grextras_math.py', DURATION, 'extras_add'], env=GRAS_ENV),
-        tokwargs(wat='gr-blocks Add\n(GRAS)',       args=['tb_grextras_math.py', DURATION, 'blocks_add'], env=GRAS_ENV),
-        tokwargs(wat='gr-blocks Add\n(GR)',         args=['tb_grextras_math.py', DURATION, 'blocks_add'], env=GR_ENV),
-        tokwargs(wat='GrExtras Mult\n(GRAS)',       args=['tb_grextras_math.py', DURATION, 'extras_mult'], env=GRAS_ENV),
-        tokwargs(wat='gr-blocks Mult\n(GRAS)',      args=['tb_grextras_math.py', DURATION, 'blocks_mult'], env=GRAS_ENV),
-        tokwargs(wat='gr-blocks Mult\n(GR)',        args=['tb_grextras_math.py', DURATION, 'blocks_mult'], env=GR_ENV),
+        tokwargs(wat='GrExtras\n(GRAS)',       args=['tb_grextras_math.py', DURATION, 'extras_mult'], env=GRAS_ENV, expand=True),
+        tokwargs(wat='gr-blocks\n(GRAS)',      args=['tb_grextras_math.py', DURATION, 'blocks_mult'], env=GRAS_ENV, expand=True),
+        tokwargs(wat='gr-blocks\n(GRSS)',      args=['tb_grextras_math.py', DURATION, 'blocks_mult'], env=GR_ENV),
     ],
 )
 
@@ -82,16 +112,19 @@ BENCHMARK_DELAY_BLOCKS = tokwargs(
 - Compare delay block implementations using GRAS.
 - The GrExtras implementation uses zero-copy.''',
     tests = [
-        tokwargs(wat='GrExtras Delay\n(GRAS)',          args=['tb_grextras_delay.py', DURATION, 'extras_delay'], env=GRAS_ENV),
-        tokwargs(wat='gr-core Delay\n(GRAS)',           args=['tb_grextras_delay.py', DURATION, 'core_delay'], env=GRAS_ENV),
-        tokwargs(wat='gr-core Delay\n(GR)',             args=['tb_grextras_delay.py', DURATION, 'core_delay'], env=GR_ENV),
+        tokwargs(wat='GrExtras\n(GRAS)',          args=['tb_grextras_delay.py', DURATION, 'extras_delay'], env=GRAS_ENV, expand=True),
+        tokwargs(wat='gr-core\n(GRAS)',           args=['tb_grextras_delay.py', DURATION, 'core_delay'], env=GRAS_ENV, expand=True),
+        tokwargs(wat='gr-core\n(GRSS)',           args=['tb_grextras_delay.py', DURATION, 'core_delay'], env=GR_ENV),
     ],
 )
 
 BENCHMARKS = (
-    BENCHMARK_MANY_11_BLOCKS,
+    BENCHMARK_LINEAR_CHAIN,
+    BENCHMARK_COMBINER_ARRAY,
     BENCHMARK_MANY_RATE_BLOCKS,
-    BENCHMARK_FILTER_BLOCK,
-    BENCHMARK_MATH_OPS,
+    BENCHMARK_DFIR_BLOCK,
+    BENCHMARK_RESAMP_BLOCK,
+    BENCHMARK_ADD_OPS,
+    BENCHMARK_MULT_OPS,
     BENCHMARK_DELAY_BLOCKS,
 )
diff --git a/benchmark/run_benchmarks.py b/benchmark/run_benchmarks.py
index 8cb3503..779aa6a 100644
--- a/benchmark/run_benchmarks.py
+++ b/benchmark/run_benchmarks.py
@@ -15,7 +15,7 @@ cpu_count = multiprocessing.cpu_count()
 
 from bm_registry import BENCHMARKS
 
-NUM_RUNS_PER_TEST = 5
+NUM_RUNS_PER_TEST = 3
 
 BAD_BOOST_KILL_DURATION = 5.0 #seconds
 
@@ -45,6 +45,25 @@ def run_a_single_one(args, env):
     raise Exception, 'no result found!'
     #return t1-t0
 
+def expand_tests(bm):
+    """Yield the tests of bm, replacing each 'expand' test by its variants."""
+    import copy #imported once per call instead of once per expanded test
+    #(label suffix, extra environment variables) for every generated variant
+    variants = (
+        ('\n(Block)', {'GRAS_YIELD': 'BLOCKING'}),
+        ('\n(Spin)', {'GRAS_YIELD': 'STRONG'}),
+        ('\n(TPB)', {'GRAS_YIELD': 'BLOCKING', 'GRAS_TPP': '1'}),
+    )
+    for run in bm['tests']:
+        if not run.get('expand'): #missing or false: pass the test through as-is
+            yield run
+            continue
+        for suffix, extra_env in variants:
+            new_run = copy.deepcopy(run) #deep copy so the shared env is not mutated
+            new_run['wat'] += suffix
+            new_run['env'].update(extra_env)
+            yield new_run
+
 def do_a_benchmark(bm):
     title = bm['wat']
     print '#'*(len(title)+25)
@@ -53,7 +72,7 @@ def do_a_benchmark(bm):
     result_means = list()
     result_stddevs = list()
     test_names = list()
-    for run in bm['tests']:
+    for run in expand_tests(bm):
         test_name = run['wat']
         print '-'*(len(test_name)+25)
         print '-- running test:', test_name.replace('\n', ' ')
diff --git a/benchmark/tb_many_1_to_1_blocks.py b/benchmark/tb_combiner_array.py
index c545d7e..c545d7e 100644
--- a/benchmark/tb_many_1_to_1_blocks.py
+++ b/benchmark/tb_combiner_array.py
diff --git a/benchmark/tb_linear_chain.py b/benchmark/tb_linear_chain.py
new file mode 100644
index 0000000..184b6e1
--- /dev/null
+++ b/benchmark/tb_linear_chain.py
@@ -0,0 +1,27 @@
+#Testbench: throughput of a linear chain of one input/one output blocks.
+import sys
+import time
+
+import gnuradio
+from gnuradio import gr
+from gnuradio import blocks as grblocks
+
+if __name__ == '__main__':
+    #first argument is how long to let the flow graph run, in seconds
+    duration = float(sys.argv[1])
+
+    #blocks are created in source-to-sink order and connected as one chain
+    tb = gr.top_block()
+    chain = [
+        gr.null_source(8), gr.copy(8), grblocks.sub_cc(), gr.copy(8),
+        grblocks.divide_cc(), gr.copy(8), gr.null_sink(8),
+    ]
+    sink = chain[-1] #kept by name to read its item counter below
+    tb.connect(*chain)
+
+    tb.start()
+    time.sleep(duration)
+    #report items per second seen by the sink, then flush before shutdown
+    print '##RESULT##', sink.nitems_read(0)/duration
+    sys.stdout.flush()
+    tb.stop()
+    tb.wait()
diff --git a/benchmark/tb_many_rate_changes.py b/benchmark/tb_many_rate_changes.py
index e94179f..2930dd6 100644
--- a/benchmark/tb_many_rate_changes.py
+++ b/benchmark/tb_many_rate_changes.py
@@ -30,15 +30,13 @@ class many_rate_changing(gr.top_block):
 		self.gr_unpacked_to_packed_xx_0 = gr.unpacked_to_packed_bb(2, gr.GR_LSB_FIRST)
 		self.gr_packed_to_unpacked_xx_0 = gr.packed_to_unpacked_bb(2, gr.GR_MSB_FIRST)
 		self.gr_null_sink_0_2 = gr.null_sink(gr.sizeof_char*1)
-		self.blocks_keep_m_in_n_0 = blocks.keep_m_in_n(gr.sizeof_float, 3, 20, 0)
 		self.blocks_float_to_char_0 = blocks.float_to_char(1, 1)
 		self.blocks_char_to_float_0 = blocks.char_to_float(1, 1)
 
 		##################################################
 		# Connections
 		##################################################
-		self.connect((self.blocks_char_to_float_0, 0), (self.blocks_keep_m_in_n_0, 0))
-		self.connect((self.blocks_keep_m_in_n_0, 0), (self.blocks_float_to_char_0, 0))
+		self.connect((self.blocks_char_to_float_0, 0), (self.blocks_float_to_char_0, 0))
 		self.connect((self.blocks_float_to_char_0, 0), (self.gr_packed_to_unpacked_xx_0, 0))
 		self.connect((self.gr_unpacked_to_packed_xx_0, 0), (self.blocks_char_to_float_0, 0))
 		self.connect((self.random_source_x_0, 0), (self.gr_unpacked_to_packed_xx_0, 0))
diff --git a/include/gras/thread_pool.hpp b/include/gras/thread_pool.hpp
index dd623d3..abb4d99 100644
--- a/include/gras/thread_pool.hpp
+++ b/include/gras/thread_pool.hpp
@@ -47,10 +47,11 @@ struct GRAS_API ThreadPoolConfig
 
     /*!
      * Yield strategy employed by the worker threads in the framework.
+     * BLOCKING,            ///< Threads block on condition variables.
      * POLITE,              ///< Threads go to sleep when not in use.
      * STRONG,              ///< Threads yield to other threads but don't go to sleep.
      * AGGRESSIVE           ///< Threads never yield to other threads.
-     * Default is STRONG.
+     * Default is BLOCKING.
      */
     std::string yield_strategy;
 };
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 6a9006c..e721769 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -24,6 +24,8 @@ add_definitions(${THERON_DEFINES})
 list(APPEND GRAS_LIBRARIES ${THERON_LIBRARIES})
 list(APPEND GRAS_SOURCES ${THERON_SOURCES})
 
+add_definitions(-DTHERON_ENABLE_DEFAULTALLOCATOR_CHECKS=1)
+
 ########################################################################
 # Setup Apology Deps
 ########################################################################
@@ -62,7 +64,6 @@ list(APPEND GRAS_SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/top_block.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/top_block_query.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/register_messages.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/theron_allocator.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/weak_container.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/serialize_types.cpp
 )
diff --git a/lib/block_actor.cpp b/lib/block_actor.cpp
index 750e52d..fe7f79e 100644
--- a/lib/block_actor.cpp
+++ b/lib/block_actor.cpp
@@ -14,7 +14,7 @@ ThreadPoolConfig::ThreadPoolConfig(void)
     thread_count = std::max(size_t(2), thread_count);
     node_mask = 0;
     processor_mask = 0xffffffff;
-    yield_strategy = "STRONG";
+    yield_strategy = "BLOCKING";
 
     //environment variable override
     const char * gras_yield = getenv("GRAS_YIELD");
@@ -44,7 +44,7 @@ ThreadPool::ThreadPool(const ThreadPoolConfig &config)
     );
 
     if (config.yield_strategy.empty()) params.mYieldStrategy = Theron::YIELD_STRATEGY_STRONG;
-    //else if (config.yield_strategy == "BLOCKING") params.mYieldStrategy = Theron::YIELD_STRATEGY_BLOCKING;
+    else if (config.yield_strategy == "BLOCKING") params.mYieldStrategy = Theron::YIELD_STRATEGY_BLOCKING;
     else if (config.yield_strategy == "POLITE") params.mYieldStrategy = Theron::YIELD_STRATEGY_POLITE;
     else if (config.yield_strategy == "STRONG") params.mYieldStrategy = Theron::YIELD_STRATEGY_STRONG;
     else if (config.yield_strategy == "AGGRESSIVE") params.mYieldStrategy = Theron::YIELD_STRATEGY_AGGRESSIVE;
diff --git a/lib/block_handlers.cpp b/lib/block_handlers.cpp
index ffa400e..d548ee0 100644
--- a/lib/block_handlers.cpp
+++ b/lib/block_handlers.cpp
@@ -147,6 +147,8 @@ void BlockActor::handle_get_stats(
         this->stats.tags_enqueued[i] = this->input_tags[i].size();
         this->stats.msgs_enqueued[i] = this->input_msgs[i].size();
     }
+    this->stats.actor_queue_depth = this->GetNumQueuedMessages();
+    this->stats.bytes_copied = this->input_queues.bytes_copied;
 
     //create the message reply object
     GetStatsMessage message;
diff --git a/lib/gras_impl/block_actor.hpp b/lib/gras_impl/block_actor.hpp
index e5dbaac..170ee1f 100644
--- a/lib/gras_impl/block_actor.hpp
+++ b/lib/gras_impl/block_actor.hpp
@@ -129,6 +129,8 @@ struct BlockActor : Apology::Worker
     OutputBufferQueues output_queues;
     std::vector<bool> produce_outputs;
     BitSet inputs_available;
+    std::vector<time_ticks_t> time_input_not_ready;
+    std::vector<time_ticks_t> time_output_not_ready;
 
     //tag and msg tracking
     std::vector<bool> input_tags_changed;
diff --git a/lib/gras_impl/input_buffer_queues.hpp b/lib/gras_impl/input_buffer_queues.hpp
index e45bc06..f9ae3a2 100644
--- a/lib/gras_impl/input_buffer_queues.hpp
+++ b/lib/gras_impl/input_buffer_queues.hpp
@@ -165,6 +165,7 @@ struct InputBufferQueues
     std::vector<boost::circular_buffer<SBuffer> > _queues;
     std::vector<size_t> _preload_bytes;
     std::vector<boost::shared_ptr<SimpleBufferQueue> > _aux_queues;
+    std::vector<item_index_t> bytes_copied;
 };
 
 
@@ -178,6 +179,7 @@ GRAS_FORCE_INLINE void InputBufferQueues::resize(const size_t size)
     _preload_bytes.resize(size, 0);
     _reserve_bytes.resize(size, 1);
     _maximum_bytes.resize(size, MAX_AUX_BUFF_BYTES);
+    bytes_copied.resize(size);
 }
 
 inline void InputBufferQueues::update_config(
@@ -254,6 +256,7 @@ GRAS_FORCE_INLINE void InputBufferQueues::accumulate(const size_t i)
         SBuffer &front = _queues[i].front();
         const size_t bytes = std::min(front.length, free_bytes);
         std::memcpy(accum_buff.get(accum_buff.length), front.get(), bytes);
+        bytes_copied[i] += bytes;
         //std::cerr << "memcpy " << bytes << std::endl;
         accum_buff.length += bytes;
         free_bytes -= bytes;
diff --git a/lib/gras_impl/stats.hpp b/lib/gras_impl/stats.hpp
index 7edab29..a6d83ed 100644
--- a/lib/gras_impl/stats.hpp
+++ b/lib/gras_impl/stats.hpp
@@ -36,8 +36,14 @@ struct BlockStats
     std::vector<item_index_t> items_produced;
     std::vector<item_index_t> tags_produced;
     std::vector<item_index_t> msgs_produced;
+    std::vector<item_index_t> bytes_copied;
+
+    //port starvation tracking
+    std::vector<time_ticks_t> inputs_idle;
+    std::vector<time_ticks_t> outputs_idle;
 
     //instantaneous port status
+    size_t actor_queue_depth;
     std::vector<size_t> items_enqueued;
     std::vector<size_t> msgs_enqueued;
     std::vector<size_t> tags_enqueued;
diff --git a/lib/theron_allocator.cpp b/lib/theron_allocator.cpp
deleted file mode 100644
index 9db9367..0000000
--- a/lib/theron_allocator.cpp
+++ /dev/null
@@ -1,100 +0,0 @@
-// Copyright (C) by Josh Blum. See LICENSE.txt for licensing information.
-
-/***********************************************************************
- * There is allocation overhead for sending messages.
- * Want per worker-allocator for message allocation...
- * But until thats possible, install a new global allocator.
- * This allocator uses a fixed pool for small sized buffers,
- * and otherwise the regular malloc/free for larger buffers.
- **********************************************************************/
-
-#include <gras/gras.hpp>
-#include <gras_impl/debug.hpp>
-#include <Theron/Detail/Threading/SpinLock.h>
-#include <Theron/IAllocator.h>
-#include <Theron/AllocatorManager.h>
-#include <boost/circular_buffer.hpp>
-
-#define MY_ALLOCATOR_CHUNK_SIZE 256
-#define MY_ALLOCATOR_POOL_SIZE (MY_ALLOCATOR_CHUNK_SIZE * (1 << 18))
-
-static unsigned long long unwanted_malloc_count = 0;
-
-static struct ExitPrinter
-{
-    ExitPrinter(void){}
-    ~ExitPrinter(void)
-    {
-        if (unwanted_malloc_count)
-        {
-            VAR(unwanted_malloc_count);
-        }
-    }
-} exit_printer;
-
-static struct WorkerAllocator : Theron::IAllocator
-{
-    WorkerAllocator(void)
-    {
-        const size_t N = MY_ALLOCATOR_POOL_SIZE/MY_ALLOCATOR_CHUNK_SIZE;
-        queue.set_capacity(N);
-        for (size_t i = 0; i < N; i++)
-        {
-            const ptrdiff_t pool_ptr = ptrdiff_t(pool) + i*MY_ALLOCATOR_CHUNK_SIZE;
-            queue.push_back((void *)pool_ptr);
-        }
-        pool_end = ptrdiff_t(pool) + MY_ALLOCATOR_POOL_SIZE;
-        Theron::AllocatorManager::Instance().SetAllocator(this);
-    }
-
-    ~WorkerAllocator(void)
-    {
-        //NOP
-    }
-
-    void *Allocate(const SizeType size)
-    {
-        if GRAS_LIKELY(size <= MY_ALLOCATOR_CHUNK_SIZE)
-        {
-            mSpinLock.Lock();
-            if GRAS_UNLIKELY(queue.empty())
-            {
-                unwanted_malloc_count++;
-                mSpinLock.Unlock();
-                return std::malloc(size);
-            }
-            void *memory = queue.front();
-            queue.pop_front();
-            mSpinLock.Unlock();
-            return memory;
-        }
-        else
-        {
-            //std::cout << "malloc size " << size << std::endl;
-            return std::malloc(size);
-        }
-    }
-
-    void Free(void *const memory)
-    {
-        const bool in_pool = ptrdiff_t(memory) >= ptrdiff_t(pool) and ptrdiff_t(memory) < pool_end;
-        if GRAS_LIKELY(in_pool)
-        {
-            mSpinLock.Lock();
-            queue.push_front(memory);
-            mSpinLock.Unlock();
-        }
-        else
-        {
-            std::free(memory);
-        }
-    }
-
-    boost::circular_buffer<void *> queue;
-    THERON_PREALIGN(GRAS_MAX_ALIGNMENT)
-        char pool[MY_ALLOCATOR_POOL_SIZE]
-    THERON_POSTALIGN(GRAS_MAX_ALIGNMENT);
-    ptrdiff_t pool_end;
-    Theron::Detail::SpinLock mSpinLock;
-
-} my_alloc;
diff --git a/lib/top_block_query.cpp b/lib/top_block_query.cpp
index b000a2d..20e1c66 100644
--- a/lib/top_block_query.cpp
+++ b/lib/top_block_query.cpp
@@ -7,6 +7,7 @@
 #include <boost/property_tree/json_parser.hpp>
 #include <boost/property_tree/xml_parser.hpp>
 #include <boost/regex.hpp>
+#include <Theron/DefaultAllocator.h>
 #include <algorithm>
 #include <sstream>
 
@@ -87,6 +88,15 @@ static std::string query_stats(ElementImpl *self, const boost::property_tree::pt
     root.put("now", time_now());
     root.put("tps", time_tps());
 
+    //allocator debugs
+    Theron::DefaultAllocator *allocator = dynamic_cast<Theron::DefaultAllocator *>(Theron::AllocatorManager::Instance().GetAllocator());
+    if (allocator)
+    {
+        root.put("bytes_allocated", allocator->GetBytesAllocated());
+        root.put("peak_bytes_allocated", allocator->GetPeakBytesAllocated());
+        root.put("allocation_count", allocator->GetAllocationCount());
+    }
+
     //iterate through blocks
     boost::property_tree::ptree blocks;
     BOOST_FOREACH(const GetStatsMessage &message, receiver.messages)
@@ -105,6 +115,7 @@ static std::string query_stats(ElementImpl *self, const boost::property_tree::pt
         block.put("total_time_post", stats.total_time_post);
         block.put("total_time_input", stats.total_time_input);
         block.put("total_time_output", stats.total_time_output);
+        block.put("actor_queue_depth", stats.actor_queue_depth);
         #define my_block_ptree_append(l) { \
             boost::property_tree::ptree e; \
             for (size_t i = 0; i < stats.l.size(); i++) { \
@@ -122,6 +133,7 @@ static std::string query_stats(ElementImpl *self, const boost::property_tree::pt
         my_block_ptree_append(items_produced);
         my_block_ptree_append(tags_produced);
         my_block_ptree_append(msgs_produced);
+        my_block_ptree_append(bytes_copied);
         blocks.push_back(std::make_pair(message.block_id, block));
     }
     root.push_back(std::make_pair("blocks", blocks));
diff --git a/python/gras/query/CMakeLists.txt b/python/gras/query/CMakeLists.txt
index ab49532..a448068 100644
--- a/python/gras/query/CMakeLists.txt
+++ b/python/gras/query/CMakeLists.txt
@@ -20,6 +20,7 @@ INSTALL(
     chart_overall_throughput.js
     chart_handler_breakdown.js
     chart_total_io_counts.js
+    chart_allocator_counts.js
     main.css
     DESTINATION ${GR_PYTHON_DIR}/gras/query
     COMPONENT ${GRAS_COMP_PYTHON}
diff --git a/python/gras/query/chart_allocator_counts.js b/python/gras/query/chart_allocator_counts.js
new file mode 100644
index 0000000..5219672
--- /dev/null
+++ b/python/gras/query/chart_allocator_counts.js
@@ -0,0 +1,52 @@
+/* Chart of the Theron allocator counters; it accepts no block selection. */
+function GrasChartAllocatorCounts(args, panel)
+{
+    //input checking: this chart refuses any block selection
+    if (args.block_ids.length != 0) throw gras_error_dialog(
+        "GrasChartAllocatorCounts",
+        "Error making allocator counts chart.\n"+
+        "Do not specify any blocks for this chart."
+    );
+    //settings: 'class' is quoted because it is a reserved word in JavaScript
+    this.div = $('<div />').attr({'class':'chart_total_io_counts'});
+    $(panel).append(this.div);
+    this.title = "Theron allocator counts";
+}
+
+/* Rebuild the list of allocator counters from one stats query result. */
+GrasChartAllocatorCounts.prototype.update = function(point)
+{
+    //drop the list left over from the previous update, then start a new one
+    $('ul', this.div).remove();
+    var list = $('<ul />');
+    this.div.append(list);
+
+    //append one "label: value" row to the list
+    function make_entry(label, value)
+    {
+        var row = $('<li />');
+        row.append($('<strong />').text(label + ": "));
+        row.append($('<span />').text(value));
+        list.append(row);
+    }
+
+    //each entry is [label, units, key looked up in the query result]
+    var stuff = [
+        ['Allocated', 'bytes', 'bytes_allocated'],
+        ['Peak size', 'bytes', 'peak_bytes_allocated'],
+        ['Malloc\'d', 'times', 'allocation_count'],
+    ];
+
+    //only counters that are present and greater than zero get a row
+    var num_rows = 0;
+    $.each(stuff, function(contents_i, contents)
+    {
+        var key = contents[2];
+        if (!(key in point)) return;
+        if (!(point[key] > 0)) return;
+        make_entry(contents[0], point[key].toString() + ' ' + contents[1]);
+        num_rows++;
+    });
+
+    //placeholder row when no counter was shown
+    if (num_rows == 0) make_entry("Counts", "none");
+}
diff --git a/python/gras/query/chart_factory.js b/python/gras/query/chart_factory.js
index 122d222..dbb141a 100644
--- a/python/gras/query/chart_factory.js
+++ b/python/gras/query/chart_factory.js
@@ -13,6 +13,7 @@ var gras_chart_get_registry = function()
         {key:'overall_throughput', name:'Overall Throughput', factory:GrasChartOverallThroughput},
         {key:'handler_breakdown', name:'Handler Breakdown', factory:GrasChartHandlerBreakdown},
         {key:'total_io_counts', name:'I/O port Totals', factory:GrasChartTotalIoCounts},
+        {key:'allocator_counts', name:'Allocator Counts', factory:GrasChartAllocatorCounts},
     ];
 }
 
diff --git a/python/gras/query/chart_total_io_counts.js b/python/gras/query/chart_total_io_counts.js
index 2aa8a84..ac1fb0e 100644
--- a/python/gras/query/chart_total_io_counts.js
+++ b/python/gras/query/chart_total_io_counts.js
@@ -50,6 +50,7 @@ GrasChartTotalIoCounts.prototype.update = function(point)
         ['Output', 'items', 'items_produced'],
         ['Output', 'tags', 'tags_produced'],
         ['Output', 'msgs', 'msgs_produced'],
+        ['Copied', 'bytes', 'bytes_copied'],
     ];
 
     $.each(stuff, function(contents_i, contents)
@@ -65,4 +66,7 @@ GrasChartTotalIoCounts.prototype.update = function(point)
             }
         });
     });
+
+    var actor_depth = block_data.actor_queue_depth;
+    if (actor_depth > 1) make_entry('Actor depth', actor_depth.toString() + ' msgs');
 }
diff --git a/python/gras/query/main.html b/python/gras/query/main.html
index b64d53f..64d5809 100644
--- a/python/gras/query/main.html
+++ b/python/gras/query/main.html
@@ -15,6 +15,7 @@
     <script type="text/javascript" src="/chart_overall_throughput.js"></script>
     <script type="text/javascript" src="/chart_handler_breakdown.js"></script>
     <script type="text/javascript" src="/chart_total_io_counts.js"></script>
+    <script type="text/javascript" src="/chart_allocator_counts.js"></script>
     <script type="text/javascript" src="/main.js"></script>
     <script type="text/javascript">
         google.load('visualization', '1.0', {'packages':['corechart']});