Full-Text Search Battle: PostgreSQL vs Elasticsearch

2020-09-08 update: Use one GIN index instead of two, websearch_to_tsquery, add LIMIT, and store TSVECTOR as separate column. More details at the end of the article.

I started investigating full-text search options recently. The use-case is real-time search over key-value pairs where the keys are strings and the values are either strings, numbers, or dates. It should enable full-text search over keys and values and range queries over numbers and dates with 1.5 million unique key-value pairs as the expected maximum search index size. The search company Algolia recommends an end-to-end latency budget of no more than 50ms, so we'll use that as our threshold.

PostgreSQL

I'm already familiar with PostgreSQL, so let's see if it meets these requirements. First, create a table,


CREATE TABLE IF NOT EXISTS search_idx
(
    id       BIGINT PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY,
    key_str  TEXT NOT NULL,
    val_str  TEXT NOT NULL,
    val_int  INT,
    val_date TIMESTAMPTZ
);

Next, seed it with semi-realistic data. The following can insert ~20,000 rows/second into tables that don't have indexes,


const pgp = require("pg-promise")();
const faker = require("faker");
const Iterations = 150;
const seedDb = async () => {
  const db = pgp({
    database: process.env.DATABASE_NAME,
    user: process.env.DATABASE_USER,
    password: process.env.DATABASE_PASSWORD,
    max: 30,
  });
  const columns = new pgp.helpers.ColumnSet(
    ["key_str", "val_str", "val_int", "val_date"],
    { table: "search_idx" },
  );
  const getNextData = (_, pageIdx) =>
    Promise.resolve(
      pageIdx > Iterations - 1
        ? null
        : Array.from(Array(10000)).map(() => ({
            key_str: `${faker.lorem.word()} ${faker.lorem.word()}`,
            val_str: faker.lorem.words(),
            val_int: Math.floor(faker.random.float()),
            val_date: faker.date.past(),
          })),
    );
  console.debug(
    await db.tx("seed-db", (t) =>
      t.sequence((idx) =>
        getNextData(t, idx).then((data) => {
          if (data) return t.none(pgp.helpers.insert(data, columns));
        }),
      ),
    ),
  );
};
seedDb();

Now that the 1.5M rows are loaded, add the full-text search GIN indexes (details),


CREATE INDEX search_idx_key_str_idx ON search_idx
    USING GIN (to_tsvector('english'::regconfig, key_str));
CREATE INDEX search_idx_val_str_idx ON search_idx
    USING GIN (to_tsvector('english'::regconfig, val_str));

Note: If adding more data to the table after creating the indexes, VACUUM ANALYZE search_idx; to update the table stats and improve query plans.

Time to test performance over the following queries,


-- Prefix query across FTS columns
SELECT *
FROM search_idx
WHERE to_tsvector('english'::regconfig, key_str) @@ to_tsquery('english'::regconfig, 'qui:*')
  OR to_tsvector('english'::regconfig, val_str) @@ to_tsquery('english'::regconfig, 'qui:*');
-- Wildcard query on key (not supported by GIN index)
SELECT *
FROM search_idx
WHERE key_str ILIKE '%quis%';
-- Specific key and value(s) query
SELECT *
FROM search_idx
WHERE to_tsvector('english'::regconfig, key_str) @@ to_tsquery('english'::regconfig, 'quis')
  AND (to_tsvector('english'::regconfig, val_str) @@ to_tsquery('english'::regconfig, 'nulla')
    OR (to_tsvector('english'::regconfig, val_str) @@ to_tsquery('english'::regconfig, 'velit')));
-- Contrived range query, one field wouldn't have both val_int and val_date populated
SELECT *
FROM search_idx
WHERE to_tsvector('english'::regconfig, key_str) @@ to_tsquery('english'::regconfig, 'quis')
  AND val_int > 1000
  AND val_date > '2020-01-01';

Add EXPLAIN in front of any query to ensure it's using the index rather than doing a full table scan. Adding EXPLAIN ANALYZE in front will give timing information, however, as noted in the documentation, this adds overhead and can sometimes take significantly longer than executing the query normally.

I consistently get ~100ms on my MacBook Pro (2.4 GHz 8-Core i9, 32 GB RAM) on the first, likely most common query. This is well over the 50ms threshold.

Elasticsearch

Next, let's try Elasticsearch. Start it and ensure the status is green with the following,


docker run -d -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch:7.9.0
curl -sX GET "localhost:9200/_cat/health?v&pretty" -H "Accept: application/json"

Next, seed the index. The first script creates a file with each line a semi-realistic document,


const faker = require("faker");
const { writeFileSync } = require("fs");
const Iterations = 1_500_000;
writeFileSync(
  "./dataset.ndjson",
  Array.from(Array(Iterations))
    .map(() =>
      JSON.stringify({
        key: `${faker.lorem.word()} ${faker.lorem.word()}`,
        val: faker.lorem.words(),
        valInt: Math.floor(faker.random.float()),
        valDate: faker.date.past(),
      }),
    )
    .join("\n"),
);

The second script adds each document from the ~150 MB file to the index,


const { createReadStream } = require("fs");
const split = require("split2");
const { Client } = require("@elastic/elasticsearch");
const Index = "search-idx";
const seedIndex = async () => {
  const client = new Client({ node: "http://localhost:9200" });
  console.debug(
    await client.helpers.bulk({
      datasource: createReadStream("./dataset.ndjson").pipe(split()),
      onDocument(doc) {
        return { index: { _index: Index } };
      },
      onDrop(doc) {
        b.abort();
      },
    }),
  );
};
seedIndex();

Note: this script is fine for testing purposes, but in production, follow best practices on sizing bulk requests and using multiple threads.

Now, run some queries,


curl -sX GET "localhost:9200/search-idx/_search?pretty" \
-H 'Content-Type: application/json' \
-d'
{
  "query": {
    "simple_query_string" : {
      "query": "\"repellat sunt\" -quis",
      "fields": ["key", "val"],
      "default_operator": "and"
    }
  }
}
'

I get 5-24ms on 136 matching results clustered on the low end the more you run the query. This is ~5x faster than PostgreSQL. So it seems like the added overhead of maintaining an Elasticsearch cluster may be worth it to get the performance I'm after.

2020-09-08 update

With @samokhvalov's help, I created a single GIN index instead of using two,


CREATE INDEX search_idx_key_str_idx ON search_idx
    USING GIN ((setweight(to_tsvector('english'::regconfig, key_str), 'A') ||
                setweight(to_tsvector('english'::regconfig, val_str), 'B')));

I'm using PostgreSQL 10 in production, however, since I'm using the latest version of Elasticsearch, it's only fair to pull the latest version of PostgreSQL (postgres:12.4-alpine at the time of writing). I then updated the query to use the new index, websearch_to_tsquery, and add the same default LIMIT used by Elasticsearch,


SELECT *
FROM search_idx
WHERE (setweight(to_tsvector('english'::regconfig, key_str), 'A') ||
       setweight(to_tsvector('english'::regconfig, val_str), 'B')) @@
      websearch_to_tsquery('english'::regconfig, '"repellat sunt" -quis')
LIMIT 10000;

This is much more of an apples to apples comparison and drastically cut the query times on my MacBook Pro, from ~100ms to ~13-16ms on 172 matching results!

As a final test, I created a standalone column to hold the TSVECTOR as described in the documentation. It's kept up-to-date via trigger,


CREATE TABLE IF NOT EXISTS search_idx
(
    id       BIGINT PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY,
    key_str  TEXT NOT NULL,
    val_str  TEXT NOT NULL,
    val_int  INT,
    val_date TIMESTAMPTZ,
    fts      TSVECTOR
);
CREATE FUNCTION fts_trigger() RETURNS trigger AS
$$
BEGIN
    new.fts :=
      setweight(to_tsvector('pg_catalog.english', new.key_str), 'A') ||
      setweight(to_tsvector('pg_catalog.english', new.val_str), 'B');
    return new;
END
$$ LANGUAGE plpgsql;
CREATE TRIGGER tgr_search_idx_fts_update
    BEFORE INSERT OR UPDATE
    ON search_idx
    FOR EACH ROW
EXECUTE FUNCTION fts_trigger();
CREATE INDEX search_idx_fts_idx ON search_idx USING GIN (fts);
SELECT *
FROM search_idx
WHERE fts @@ websearch_to_tsquery('english'::regconfig, '"repellat sunt" -quis')
LIMIT 10000;

This gets queries down to 6-10ms when measured in psql, practically the same as Elasticsearch for this specific query.

If anyone knows of further optimizations, I'll update my rough numbers and the code snippets to include them.

PostgreSQL

Elasticsearch

2020-09-08 update

Stay up to date