See The Spark blogs

JavaScript inner join and outer join

My background is in database administration and development. That means I often think about programming problems in a databasey way, like all programmers (I have met) who write SQL do so in a programmerish way. There are times when combining the two is useful though. I had two arrays from different sources and need to combine them on their common key. In the same way a database join would. The slight difference is that most database joins can take non equality operators (> and < for example) but in this instance I only need equality (or equi) joins. The obvious solution is a nested loop, something like this:

Loops

var newArray = [];
for(var i = 0, l = array1.length, m = array2.length; i < l; i++) {
  for(var j = 0; j < m; m++) {
    if (array1[i].myKey === array2[j].myKey) {
      newArray.push({myKey: array1[i].myKey, 
        val1: array1[i].val, 
        val2: array2[j].val, 
      })
    }
  }
}

Indexed lookup

This is fine for small arrays, indeed, it's an approximation of how RDBMS will join for small tables. The problem is that it has to do a lot of work. The total array iterations is array1.length X array2.length. A StackOverflow answer to this question suggested building an index of array1 keys and you only have to iterate through each array once. The JavaScript engine handles the lookup and it's much, much faster than in the interpreted JavaScript code.
The SO answer assumes a primary/foreign key relationship which meant is the second array includes values not in the first it fails. It doesn't handle composite keys. It is restricted to inner style joins. Finally, it uses an array for the index and I found Map() to be about 10% faster for my purposes. Composite keys slow down the running of the function so it can take a string to represent the keys or an array. Use the string approach where possible. Also integer key values are faster than strings but you probably know that.

/** 
 * join two arrays together, like an inner join but only for equality rather than allowing complex operators
 * 
 * The select function looks something like the following
 * function (a, b) {
    return {
        id: a.id,
        text: a.text,
        name: b.name
    };
  * credit to http://stackoverflow.com/a/17500836/2561030  for the original idea.
  * A single key integer column as a string is about 8 times faster (Node 4.4.5) than the array version.  A single key string is about 1.5 times as fast.
  * Adding more columns slows it more.
  * Even the array keys version is about 15 times quicker than a nested loop.
  * @param {Array} arr1 - first array to join
  * @param {Array} arr2 - second array to join
  * @param {Array or string} arr1key - first array's key column.  String or array of strings'
  * @param {Array or string} arr2key - second array's key column.  String or array of strings
  * @param {Function} select - return which columns are required
 */

  function equijoin(arr1, arr2, arr1Key, arr2Key, select) {
    var m = arr1.length, n = arr2.length, index = new Map(), c = [], row, rowKey, arr1Row;

    function mapFunc(keyItem) { // Build the composite key
      return row[keyItem];
    }

    if (Array.isArray(arr1Key) && Array.isArray(arr2Key)) {
      arr1Key.sort();/// Allow the key kolumns to be entered in any order.
      arr2Key.sort();
      for (var i = 0; i < m; i++) {     // loop through m items
          row = arr1[i];
          rowKey = arr1Key.map(mapFunc).join('~~'); /// combine the key values for lookup later
          index.set(rowKey, row); // create an index for arr1 table
      }

      for (var j = 0; j < n; j++) {     // loop through n items
          row = arr2[j];
          rowKey = arr2Key.map(mapFunc).join('~~');
          if (rowKey !== undefined && rowKey !== '') { /// NULL !== NULL
             arr1Row = index.get(rowKey); // get corresponding row from arr1
           } else {
            arr1Row = undefined;
           }

          if (arr1Row) {
             c.push(select(arr1Row, row));         // select only the columns you need
          }
      }
    } else {
      for (var k = 0; k < m; k++) {     // loop through m items
        row = arr1[k];
        index[row[arr1Key]] = row; // create an index for arr1 table
      }

      for (var l = 0; l < n; l++) {     // loop through n items
        row = arr2[l];
        if (row[arr2Key] !== null) {
          arr1Row = index[row[arr2Key]]; // get corresponding row from arr1
        } else {
          arr1Row = undefined;
        }

        if (arr1Row) {
          c.push(select(arr1Row, row));         // select only the columns you need
        }
      }
    }

    return c;
  }

And to use it:

  var foo = equijoin(arr1, arr2, ['id', 'a'], ['a', 'id'], function(a, b) {
    return {id: a.id, a: a.a, pri: a.value, fore: b.value};
  });

  var foo1 = equijoin(arr1, arr2, 'id', 'id', function(a, b) {
    return {id: a.id, a: a.a, pri: a.value, fore: b.value};
  });

Outer join

OK, inner style joins are all well and good. What happens if one of the arrays doesn't contain all of the rows on the other? An outer join if you will. A small modification can give us left and right joins.

  function rightjoin(arr1, arr2, arr1Key, arr2Key, select) {
    var m = arr1.length, n = arr2.length, index = new Map(), c = [], row, rowKey, arr1Row;

    function mapFunc(keyItem) {       // Build the composite key
      return row[keyItem];
    }

    if (Array.isArray(arr1Key) && Array.isArray(arr2Key)) {
      arr1Key.sort();                  /// Allow the key kolumns to be entered in any order.
      arr2Key.sort();
      for (var i = 0; i < m; i++) {     // loop through m items
          row = arr1[i];
          rowKey = arr1Key.map(mapFunc).join('~~'); /// combine the key values for lookup later
          index.set(rowKey, row);            // create an index for arr1 table
      }

      for (var j = 0; j < n; j++) {     // loop through n items
          row = arr2[j];
          rowKey = arr2Key.map(mapFunc).join('~~');

          if (rowKey !== undefined && rowKey !== '') { /// NULL !== NULL
            arr1Row = index.get(rowKey);      // get corresponding row from arr1
          } else {
            arr1Row = {};
          }

          if (!arr1Row) {
            arr1Row = {};
          }
          c.push(select(arr1Row, row));  // select only the columns you need

      }
    } else {
      for (var k = 0; k < m; k++) {     // loop through m items
        row = arr1[k];
        index.set(row[arr1Key], row);

              // create an index for arr1 table
      }

      for (var l = 0; l < n; l++) {     // loop through n items
        row = arr2[l];
        if (row[arr2Key] !== null && row[arr2Key] !== undefined) { /// NULL !== NULL
          arr1Row = index.get(row[arr2Key]); // get corresponding row from arr1
        } else {
          arr1Row = {};
        }

        if (!arr1Row) {
          arr1Row = {};
        }
        c.push(select(arr1Row, row));   // select only the columns you need
      }
    }

    return c;
  }

Limitations

There is one big disadvantage of this approach over nested loops. If the keys in the first column aren't unique then only one instance will be matched.
Big arrays use a lot of memory and become slow. Over 100,000 rows becomes impractical. As mentioned above, I have used Map() for the index. That won't work in IE. A version using arrays is below.

Testing

I use Node.js and Express to test each approach. The code below builds two arrays with a bit of random data. It then joins them using the functions above.

var crypto = require('crypto');

function random(howMany, chars) {
  chars = chars || "abcdefghijklmnopqrstuwxyz0123456789";
  var rnd = crypto.randomBytes(howMany),
    value = new Array(howMany),
    len = chars.length,
    i;

  for (i = 0; i < howMany; i++) {
    value[i] = chars[rnd[i] % len];
  }

  return value.join('');
}

router.get('/abc', function(req, res, next) {
  var arr1 = [], arr2 = [], a;

  for (var i = 0; i < 10000; i++) {
    a = 'abc';
    if (i % 5 === 0) {
      a = 0;
    }
    arr1.push({id: i, a: 'abc', value: random(10)});
  }
  for (var j = 0; j < 10000; j++) {
    a = 'abc';
    if (j % 2 === 0) {
      if (j % 6 === 0) {
        a = 0;
      }
      arr2.push({id:  j, a: a, value: random(10)});
    }
  }

  var d1 = new Date();
  var foo = equijoin(arr1, arr2, ['id', 'a'], ['a', 'id'], function(primary, foreign) {
    return {id: primary.id, a: primary.a, pri: primary.value, fore: foreign.value};
  });
  var timer = Date.now() - d1;
  d1 = new Date();
  var foo1 = equijoin(arr1, arr2, 'id', 'id', function(primary, foreign) {
    return {id: primary.id, a: primary.a, pri: primary.value, fore: foreign.value};
  });

  var timer1 = Date.now() - d1;
  d1 = new Date();

  var foo2 = equijoin(arr1, arr2, ['id', 'a'], ['a', 'id'], function(primary, foreign) {
    return {id: primary.id, a: primary.a, pri: primary.value, fore: foreign.value};
  });
  var timer2 = Date.now() - d1;

  d1 = new Date();

  var foo3 = leftjoin(arr1, arr2,  ['id', 'a'], ['a', 'id'], function (a, b) {
    return {id: a.id, a: a.a, pri: a.value, fore: b.value};
  });
  var timer3 = Date.now() - d1; 
  d1 = new Date();

   var foo4 = leftjoin(arr1, arr2,  'id', 'id', function (a, b) {
    return {id: a.id, a: a.a, pri: a.value, fore: b.value};
  });
  var timer4 = Date.now() - d1;

  res.send({time: timer, foo: foo.length, 
    time1: timer1, foo1: foo1.length, 
    time2: timer2, foo2l: foo2.length,
    time3: timer3, foo3l: foo3.length,
    time4: timer4, foo4l: foo4.length});

});

And finally

As promised, here is a version which uses arrays rather than Map() for the index. It's a little slower but should work in IE. I haven't tested it.

  function equijoin2(arr1, arr2, arr1Key, arr2Key, select) {
    var m = arr1.length, n = arr2.length, index = [], c = [], row, rowKey, arr1Row;

    function mapFunc(keyItem) { // Build the composite key
      return row[keyItem];
    }

    if (Array.isArray(arr1Key) && Array.isArray(arr2Key)) {
      arr1Key.sort();/// Allow the key kolumns to be entered in any order.
      arr2Key.sort();
      for (var i = 0; i < m; i++) {     // loop through m items
          row = arr1[i];
          rowKey = arr1Key.map(mapFunc).join('~~'); /// combine the key values for lookup later.
          index[rowKey] = row; // create an index for arr1 table
      }

      for (var j = 0; j < n; j++) {     // loop through n items
          row = arr2[j];
          rowKey = arr2Key.map(mapFunc).join('~~');
          arr1Row = index[rowKey]; // get corresponding row from arr1

          if (arr1Row) {
             c.push(select(arr1Row, row));         // select only the columns you need
          }

      }
    } else {
      for (var k = 0; k < m; k++) {     // loop through m items
        row = arr1[k];
        index[row[arr1Key]] = row; // create an index for arr1 table
      }

      for (var l = 0; l < n; l++) {     // loop through n items
        row = arr2[l];
        arr1Row = index[row[arr2Key]]; // get corresponding row from arr1
        if (arr1Row) {
          c.push(select(arr1Row, row));         // select only the columns you need
        }
      }
    }

    return c;
  }

  function rightjoin2(arr1, arr2, arr1Key, arr2Key, select) {
    var m = arr1.length, n = arr2.length, index = [], c = [], row, rowKey, arr1Row;

    function mapFunc(keyItem) {       // Build the composite key
      return row[keyItem];
    }

    if (Array.isArray(arr1Key) && Array.isArray(arr2Key)) {
      arr1Key.sort();                  /// Allow the key kolumns to be entered in any order.
      arr2Key.sort();
      for (var i = 0; i < m; i++) {     // loop through m items
          row = arr1[i];
          rowKey = arr1Key.map(mapFunc).join('~~'); /// combine the key values for lookup later
          index[rowKey] = row;            // create an index for arr1 table
      }

      for (var j = 0; j < n; j++) {     // loop through n items
          row = arr2[j];
          rowKey = arr2Key.map(mapFunc).join('~~');
          arr1Row = index[rowKey];      // get corresponding row from arr1

          if (!arr1Row) {
            arr1Row = {};
          }
          c.push(select(arr1Row, row));  // select only the columns you need
          //c[c.length] = select(arr1Row, row)

      }
    } else {
      for (var k = 0; k < m; k++) {     // loop through m items
        row = arr1[k];
        index[row[arr1Key]] = row;      // create an index for arr1 table
      }

      for (var l = 0; l < n; l++) {     // loop through n items
        row = arr2[l];
        arr1Row = index[row[arr2Key]]; // get corresponding row from arr1
        if (!arr1Row) {
          arr1Row = {};
        }
        c.push(select(arr1Row, row));   // select only the columns you need
      }
    }

    return c;
  }
Date published: 2016-12-03
Last updated: 2016-12-03
Add a comment You must be logged in and have confirmed your email to comment. Sign in