JavaScript inner join and outer join
My background is in database administration and development. That means I often think about programming problems in a databasey way, like all programmers (I have met) who write SQL do so in a programmerish way. There are times when combining the two is useful though. I had two arrays from different sources and need to combine them on their common key. In the same way a database join would. The slight difference is that most database joins can take non equality operators (> and < for example) but in this instance I only need equality (or equi) joins. The obvious solution is a nested loop, something like this:
Loops
var newArray = [];
for(var i = 0, l = array1.length, m = array2.length; i < l; i++) {
for(var j = 0; j < m; m++) {
if (array1[i].myKey === array2[j].myKey) {
newArray.push({myKey: array1[i].myKey,
val1: array1[i].val,
val2: array2[j].val,
})
}
}
}
Indexed lookup
This is fine for small arrays, indeed, it's an approximation of how RDBMS will join for small tables. The problem is that it has to do a lot of work. The total array iterations is array1.length X array2.length.
A StackOverflow answer to this question suggested building an index of array1 keys and you only have to iterate through each array once. The JavaScript engine handles the lookup and it's much, much faster than in the interpreted JavaScript code.
The SO answer assumes a primary/foreign key relationship which meant is the second array includes values not in the first it fails. It doesn't handle composite keys. It is restricted to inner style joins. Finally, it uses an array for the index and I found Map() to be about 10% faster for my purposes.
Composite keys slow down the running of the function so it can take a string to represent the keys or an array. Use the string approach where possible. Also integer key values are faster than strings but you probably know that.
/**
* join two arrays together, like an inner join but only for equality rather than allowing complex operators
*
* The select function looks something like the following
* function (a, b) {
return {
id: a.id,
text: a.text,
name: b.name
};
* credit to http://stackoverflow.com/a/17500836/2561030 for the original idea.
* A single key integer column as a string is about 8 times faster (Node 4.4.5) than the array version. A single key string is about 1.5 times as fast.
* Adding more columns slows it more.
* Even the array keys version is about 15 times quicker than a nested loop.
* @param {Array} arr1 - first array to join
* @param {Array} arr2 - second array to join
* @param {Array or string} arr1key - first array's key column. String or array of strings'
* @param {Array or string} arr2key - second array's key column. String or array of strings
* @param {Function} select - return which columns are required
*/
function equijoin(arr1, arr2, arr1Key, arr2Key, select) {
var m = arr1.length, n = arr2.length, index = new Map(), c = [], row, rowKey, arr1Row;
function mapFunc(keyItem) { // Build the composite key
return row[keyItem];
}
if (Array.isArray(arr1Key) && Array.isArray(arr2Key)) {
arr1Key.sort();/// Allow the key kolumns to be entered in any order.
arr2Key.sort();
for (var i = 0; i < m; i++) { // loop through m items
row = arr1[i];
rowKey = arr1Key.map(mapFunc).join('~~'); /// combine the key values for lookup later
index.set(rowKey, row); // create an index for arr1 table
}
for (var j = 0; j < n; j++) { // loop through n items
row = arr2[j];
rowKey = arr2Key.map(mapFunc).join('~~');
if (rowKey !== undefined && rowKey !== '') { /// NULL !== NULL
arr1Row = index.get(rowKey); // get corresponding row from arr1
} else {
arr1Row = undefined;
}
if (arr1Row) {
c.push(select(arr1Row, row)); // select only the columns you need
}
}
} else {
for (var k = 0; k < m; k++) { // loop through m items
row = arr1[k];
index[row[arr1Key]] = row; // create an index for arr1 table
}
for (var l = 0; l < n; l++) { // loop through n items
row = arr2[l];
if (row[arr2Key] !== null) {
arr1Row = index[row[arr2Key]]; // get corresponding row from arr1
} else {
arr1Row = undefined;
}
if (arr1Row) {
c.push(select(arr1Row, row)); // select only the columns you need
}
}
}
return c;
}
And to use it:
var foo = equijoin(arr1, arr2, ['id', 'a'], ['a', 'id'], function(a, b) {
return {id: a.id, a: a.a, pri: a.value, fore: b.value};
});
var foo1 = equijoin(arr1, arr2, 'id', 'id', function(a, b) {
return {id: a.id, a: a.a, pri: a.value, fore: b.value};
});
Outer join
OK, inner style joins are all well and good. What happens if one of the arrays doesn't contain all of the rows on the other? An outer join if you will. A small modification can give us left and right joins.
function rightjoin(arr1, arr2, arr1Key, arr2Key, select) {
var m = arr1.length, n = arr2.length, index = new Map(), c = [], row, rowKey, arr1Row;
function mapFunc(keyItem) { // Build the composite key
return row[keyItem];
}
if (Array.isArray(arr1Key) && Array.isArray(arr2Key)) {
arr1Key.sort(); /// Allow the key kolumns to be entered in any order.
arr2Key.sort();
for (var i = 0; i < m; i++) { // loop through m items
row = arr1[i];
rowKey = arr1Key.map(mapFunc).join('~~'); /// combine the key values for lookup later
index.set(rowKey, row); // create an index for arr1 table
}
for (var j = 0; j < n; j++) { // loop through n items
row = arr2[j];
rowKey = arr2Key.map(mapFunc).join('~~');
if (rowKey !== undefined && rowKey !== '') { /// NULL !== NULL
arr1Row = index.get(rowKey); // get corresponding row from arr1
} else {
arr1Row = {};
}
if (!arr1Row) {
arr1Row = {};
}
c.push(select(arr1Row, row)); // select only the columns you need
}
} else {
for (var k = 0; k < m; k++) { // loop through m items
row = arr1[k];
index.set(row[arr1Key], row);
// create an index for arr1 table
}
for (var l = 0; l < n; l++) { // loop through n items
row = arr2[l];
if (row[arr2Key] !== null && row[arr2Key] !== undefined) { /// NULL !== NULL
arr1Row = index.get(row[arr2Key]); // get corresponding row from arr1
} else {
arr1Row = {};
}
if (!arr1Row) {
arr1Row = {};
}
c.push(select(arr1Row, row)); // select only the columns you need
}
}
return c;
}
Limitations
There is one big disadvantage of this approach over nested loops. If the keys in the first column aren't unique then only one instance will be matched.
Big arrays use a lot of memory and become slow. Over 100,000 rows becomes impractical.
As mentioned above, I have used Map() for the index. That won't work in IE. A version using arrays is below.
Testing
I use Node.js and Express to test each approach. The code below builds two arrays with a bit of random data. It then joins them using the functions above.
var crypto = require('crypto');
function random(howMany, chars) {
chars = chars || "abcdefghijklmnopqrstuwxyz0123456789";
var rnd = crypto.randomBytes(howMany),
value = new Array(howMany),
len = chars.length,
i;
for (i = 0; i < howMany; i++) {
value[i] = chars[rnd[i] % len];
}
return value.join('');
}
router.get('/abc', function(req, res, next) {
var arr1 = [], arr2 = [], a;
for (var i = 0; i < 10000; i++) {
a = 'abc';
if (i % 5 === 0) {
a = 0;
}
arr1.push({id: i, a: 'abc', value: random(10)});
}
for (var j = 0; j < 10000; j++) {
a = 'abc';
if (j % 2 === 0) {
if (j % 6 === 0) {
a = 0;
}
arr2.push({id: j, a: a, value: random(10)});
}
}
var d1 = new Date();
var foo = equijoin(arr1, arr2, ['id', 'a'], ['a', 'id'], function(primary, foreign) {
return {id: primary.id, a: primary.a, pri: primary.value, fore: foreign.value};
});
var timer = Date.now() - d1;
d1 = new Date();
var foo1 = equijoin(arr1, arr2, 'id', 'id', function(primary, foreign) {
return {id: primary.id, a: primary.a, pri: primary.value, fore: foreign.value};
});
var timer1 = Date.now() - d1;
d1 = new Date();
var foo2 = equijoin(arr1, arr2, ['id', 'a'], ['a', 'id'], function(primary, foreign) {
return {id: primary.id, a: primary.a, pri: primary.value, fore: foreign.value};
});
var timer2 = Date.now() - d1;
d1 = new Date();
var foo3 = leftjoin(arr1, arr2, ['id', 'a'], ['a', 'id'], function (a, b) {
return {id: a.id, a: a.a, pri: a.value, fore: b.value};
});
var timer3 = Date.now() - d1;
d1 = new Date();
var foo4 = leftjoin(arr1, arr2, 'id', 'id', function (a, b) {
return {id: a.id, a: a.a, pri: a.value, fore: b.value};
});
var timer4 = Date.now() - d1;
res.send({time: timer, foo: foo.length,
time1: timer1, foo1: foo1.length,
time2: timer2, foo2l: foo2.length,
time3: timer3, foo3l: foo3.length,
time4: timer4, foo4l: foo4.length});
});
And finally
As promised, here is a version which uses arrays rather than Map() for the index. It's a little slower but should work in IE. I haven't tested it.
function equijoin2(arr1, arr2, arr1Key, arr2Key, select) {
var m = arr1.length, n = arr2.length, index = [], c = [], row, rowKey, arr1Row;
function mapFunc(keyItem) { // Build the composite key
return row[keyItem];
}
if (Array.isArray(arr1Key) && Array.isArray(arr2Key)) {
arr1Key.sort();/// Allow the key kolumns to be entered in any order.
arr2Key.sort();
for (var i = 0; i < m; i++) { // loop through m items
row = arr1[i];
rowKey = arr1Key.map(mapFunc).join('~~'); /// combine the key values for lookup later.
index[rowKey] = row; // create an index for arr1 table
}
for (var j = 0; j < n; j++) { // loop through n items
row = arr2[j];
rowKey = arr2Key.map(mapFunc).join('~~');
arr1Row = index[rowKey]; // get corresponding row from arr1
if (arr1Row) {
c.push(select(arr1Row, row)); // select only the columns you need
}
}
} else {
for (var k = 0; k < m; k++) { // loop through m items
row = arr1[k];
index[row[arr1Key]] = row; // create an index for arr1 table
}
for (var l = 0; l < n; l++) { // loop through n items
row = arr2[l];
arr1Row = index[row[arr2Key]]; // get corresponding row from arr1
if (arr1Row) {
c.push(select(arr1Row, row)); // select only the columns you need
}
}
}
return c;
}
function rightjoin2(arr1, arr2, arr1Key, arr2Key, select) {
var m = arr1.length, n = arr2.length, index = [], c = [], row, rowKey, arr1Row;
function mapFunc(keyItem) { // Build the composite key
return row[keyItem];
}
if (Array.isArray(arr1Key) && Array.isArray(arr2Key)) {
arr1Key.sort(); /// Allow the key kolumns to be entered in any order.
arr2Key.sort();
for (var i = 0; i < m; i++) { // loop through m items
row = arr1[i];
rowKey = arr1Key.map(mapFunc).join('~~'); /// combine the key values for lookup later
index[rowKey] = row; // create an index for arr1 table
}
for (var j = 0; j < n; j++) { // loop through n items
row = arr2[j];
rowKey = arr2Key.map(mapFunc).join('~~');
arr1Row = index[rowKey]; // get corresponding row from arr1
if (!arr1Row) {
arr1Row = {};
}
c.push(select(arr1Row, row)); // select only the columns you need
//c[c.length] = select(arr1Row, row)
}
} else {
for (var k = 0; k < m; k++) { // loop through m items
row = arr1[k];
index[row[arr1Key]] = row; // create an index for arr1 table
}
for (var l = 0; l < n; l++) { // loop through n items
row = arr2[l];
arr1Row = index[row[arr2Key]]; // get corresponding row from arr1
if (!arr1Row) {
arr1Row = {};
}
c.push(select(arr1Row, row)); // select only the columns you need
}
}
return c;
}