-- ======================================================================
--
-- Fulltext Search (FTS) mit PostgreSQL 9
--
-- ======================================================================

-- Some initial commands to learn FTS functions:
-----------------------------------------------

-- Casting a string to tsvector only splits it into terms, sorts them and
-- removes duplicates. The sample results are kept as comments so that the
-- script can be executed as a whole.
SELECT 'Our first string used today'::tsvector;
-- Result: "'Our' 'first' 'string' 'today' 'used'"

SELECT 'Our first string used today first string'::tsvector;
-- Result: "'Our' 'first' 'string' 'today' 'used'"
--> still the same terms

SELECT plainto_tsquery('simple','the one');
--> Get preprocessed text 

-- Same input, three text search configurations. UNION removes duplicate
-- rows, so configurations that produce identical vectors show up only once.
SELECT to_tsvector('simple','Our first string used today first string')
UNION
SELECT to_tsvector('german','Our first string used today first string')
UNION 
SELECT to_tsvector('english','Our first string used today first string');

SELECT to_tsquery('simple', 'word');

SELECT to_tsvector('simple', 'Andy');

-- Without an explicit configuration the session default is used
SELECT to_tsvector('rosa una rosa una rosa');

-- Compare matching against a plain cast ...
SELECT 'Peter Piper picked a peck of pickled peppers'::tsvector @@ to_tsquery('peter & pip') AS "Result";

-- ... with matching against a vector built by to_tsvector()
SELECT to_tsvector('Peter Piper picked a peck of pickled peppers') @@ to_tsquery('peter & pip') AS "Result";

-- Terminated with a semicolon so it does not run into the next statement
SELECT to_tsvector('Peter Piper picked a peck of pickled peppers, Piper');


--- Let's define a table
------------------------

-- Document store used by all following examples: one row per text file.
CREATE TABLE collection (
    id      SERIAL,                  -- auto-numbered key
    file    CHARACTER VARYING(255),  -- file name, at most one row per name
    content TEXT,                    -- text of the file, may be NULL
    PRIMARY KEY (id),
    UNIQUE (file)
);
-- DROP TABLE collection CASCADE;


-- Reading an external file
----------------------------
-- Create a folder named "text" inside the PostgreSQL data directory; the
-- function below reads its files from there.
-- 1. Determine the location of your cluster by running as super user:
SELECT name, setting FROM pg_settings WHERE name='data_directory'; 

-- 2. Create a function to load a doc
-- DROP FUNCTION get_text_document(CHARACTER VARYING);
--
-- get_text_document(p_filename)
--   p_filename: name of a file below the "text/" folder
--   returns:    the content of text/<p_filename> as TEXT
--
-- The function is SECURITY DEFINER, i.e. it runs with the privileges of its
-- owner. For that reason the search_path is pinned and pg_read_file is
-- schema-qualified, so that a caller cannot redirect the call to a function
-- of the same name in another schema.
CREATE OR REPLACE FUNCTION get_text_document(p_filename CHARACTER VARYING)
  RETURNS TEXT AS $$
  -- The read length is set to a big number instead of determining the real
  -- file size; reading stops at the end of the file anyway.
  SELECT CAST(pg_catalog.pg_read_file('text/' || $1, 0, 100000000) AS TEXT);
$$ LANGUAGE sql VOLATILE SECURITY DEFINER
   SET search_path = pg_catalog, pg_temp;
ALTER FUNCTION get_text_document(CHARACTER VARYING) OWNER TO postgres;

-- 3. Now copy the file into the "text/" folder that get_text_document()
-- reads from (HINT: File must be UTF-8) and test the function:
SELECT get_text_document('file1.txt'); 

-- Insert documents (HINT: files must be UTF-8)
INSERT INTO collection(file, content)
  VALUES ('file1.txt', get_text_document('file1.txt'));
SELECT * FROM collection;
INSERT INTO collection(file, content)
  VALUES ('file2.txt', get_text_document('file2.txt'));
SELECT * FROM collection;

-- Re-read the content of a document that is already stored. The row is
-- addressed by its unique file name; a hard-coded id depends on the current
-- state of the sequence and may match no row or the wrong one.
UPDATE collection 
  SET content = get_text_document('file1.txt')
  WHERE file = 'file1.txt';

-- Loads all files from a directory into table 'collection'.
--
-- Parameter $1: directory to read, as accepted by pg_ls_dir(), e.g.
--               'text/ozon/'. A missing trailing slash is added.
-- Returns:      no. of rows (= files) inserted, or -1 if anything failed.
--               On failure a WARNING with the error message is raised, and
--               the rows inserted by this call are rolled back together with
--               the failing block.
-- Sub-directories found in the directory are skipped.
CREATE OR REPLACE FUNCTION load_collection(TEXT)
  RETURNS INT AS $$
  DECLARE 
    dir_path TEXT := $1;
    afile TEXT;
    acontent TEXT;
    is_directory BOOLEAN;
    counter INT := 0; 
  BEGIN 
    -- File names are appended to the directory, so it has to end with '/'
    IF dir_path ~ '[^/]$' THEN
      dir_path := dir_path || '/';
    END IF;

    FOR afile IN SELECT pg_ls_dir(dir_path) LOOP
      -- pg_read_file() cannot read directories; skip them
      SELECT isdir INTO is_directory FROM pg_stat_file(dir_path || afile);
      CONTINUE WHEN is_directory;

      -- Big read length instead of the real size; reading stops at EOF
      acontent := CAST(pg_read_file(dir_path || afile, 0, 100000000) AS TEXT);
      INSERT INTO collection(file, content)
        VALUES (afile, acontent);
      counter := counter + 1;
    END LOOP;
    RETURN counter;
    EXCEPTION
    WHEN others THEN
      -- Keep the -1 return value for callers, but report what went wrong
      RAISE WARNING 'load_collection(%) failed: % (SQLSTATE %)', $1, SQLERRM, SQLSTATE;
      RETURN -1;
  END;
$$ LANGUAGE plpgsql;
-- DROP FUNCTION load_collection(TEXT);

-- Load the collection (from Soekia)
SELECT load_collection('text/ozon/');

-- Short preview of every loaded document
SELECT
    id,
    file,
    substring(content, 1, 30) || '...' AS "content"
FROM collection
ORDER BY file;

-- Look at the tuples
SELECT *
FROM collection;

SELECT
    id,
    file,
    substring(content, 1, 30) || '...' AS "content"
FROM collection
ORDER BY file;

-- Preview of the text next to the beginning of its tsvector
SELECT
    id,
    file,
    substring(content, 1, 30) || '...' AS "content",
    substring(to_tsvector(content)::text, 1, 30) AS "tsvector"
FROM collection
ORDER BY file;


--- The types tsvector and tsquery

-- The document vector of every file
SELECT
    file,
    to_tsvector(content) AS "tsvector"
FROM collection
ORDER BY file;

-- Does the document match a single term?
SELECT
    file,
    to_tsvector(content) @@ to_tsquery('Nerds') AS "result"
FROM collection
ORDER BY file;

-- Does the document match one term or the other?
SELECT
    file,
    to_tsvector(content) @@ to_tsquery('Nerds | Geeks') AS "result"
FROM collection
ORDER BY file;

-- Number of matching documents: full-text search ...
SELECT count(*)
FROM collection
WHERE to_tsvector(content) @@ to_tsquery('Nerds');

-- ... compared with a case-insensitive regular expression
SELECT count(*)
FROM collection
WHERE content ~* 'nerds';


--- CREATE INDEX 

-- to_tsvector2(text): wrapper around the single-argument to_tsvector() that
-- is declared IMMUTABLE so that it can be used in an index expression.
-- NOTE(review): single-argument to_tsvector() depends on the setting
-- default_text_search_config, so the IMMUTABLE marking only holds as long as
-- that setting never changes - confirm, or pin the configuration explicitly.
CREATE OR REPLACE FUNCTION to_tsvector2(TEXT)
RETURNS tsvector
LANGUAGE sql
IMMUTABLE
AS $body$
    SELECT to_tsvector($1);
$body$;

-- DROP INDEX fts_collection;
-- GiST index over the vector built from file name and content; NULL values
-- are replaced by a blank so that the concatenation never becomes NULL.
-- Hint: functions in index expression must be marked IMMUTABLE
-- NOTE(review): an expression index is normally only considered for queries
-- that use the same expression; the queries in this file search
-- to_tsvector(content) instead - confirm this is intended.
CREATE INDEX fts_collection
  ON collection
  USING gist (
    to_tsvector2(coalesce(file, ' ') || ' ' || coalesce(content, ' '))
  );

-- Run the counting queries again now that the index exists
SELECT count(*) FROM collection WHERE
  to_tsvector(content) @@ to_tsquery('Nerds');
SELECT count(*) FROM collection WHERE 
  content ~* 'nerds';

-- Refresh the planner statistics, then show the plan of a search query.
-- ANALYZE needs its own terminator: without it the following EXPLAIN is
-- parsed as part of the ANALYZE statement and the script fails.
ANALYZE;
EXPLAIN
SELECT file, to_tsvector(content) @@ to_tsquery('Nerds | Geeks') AS "result" 
  FROM collection ORDER BY file;


--- Display contents again...
-----------------------------

-- Short preview of every document
SELECT
    id,
    file,
    substring(content, 1, 30) || '...' AS "content"
FROM collection
ORDER BY file;

-- Only the documents that contain all three terms
SELECT
    file,
    substring(content, 1, 30) || '...' AS "content"
FROM collection
WHERE to_tsvector(content) @@ to_tsquery('Weltretter & mit & Hornbrillen')
ORDER BY file;

-- Highlighted excerpt and rank for the term 'Ozon'
SELECT
    file,
    ts_headline(content, to_tsquery('Ozon')),
    ts_rank(to_tsvector(content), to_tsquery('Ozon'))
FROM collection;


-- DROP FUNCTION fts_query(text, text);
-- fts_query(content, query): TRUE if the text matches the query.
--   content: document text, converted with to_tsvector()
--   query:   search expression in to_tsquery() syntax, e.g. 'a & b'
-- Both conversions use the default text search configuration.
CREATE OR REPLACE FUNCTION fts_query(content text, query text)
RETURNS boolean
LANGUAGE sql
AS $body$
    SELECT to_tsvector($1) @@ to_tsquery($2);
$body$;


-- Match flag per document
SELECT file,fts_query(content, 'Ozon') FROM collection;

-- Preview of the matching documents
SELECT file,substring(content from 1 for 30)||'...' as "content" FROM collection WHERE 
  fts_query(content, 'Ozon') ORDER BY file;

-- 'simple' vector of the matching documents with line breaks made visible.
-- The 'g' flag replaces every line break; without it regexp_replace()
-- stops after the first match.
SELECT file,regexp_replace(to_tsvector('simple',content)::text, '(\n|\r)', 'NL', 'g') FROM collection WHERE 
  fts_query(content, 'Ozon') ORDER BY file;
SELECT file,fts_query(content, 'hohem') FROM collection;

  
SELECT file,fts_query(content, 'Weltretter & mit & Hornbrillen') FROM collection;
SELECT file,fts_query(content, 'Nerd') FROM collection;


-- What about regex?
--------------------

-- Case-insensitive regular expression match per document
SELECT
    file,
    content ~* 'ozon' AS "matched"
FROM collection
ORDER BY file;

-- regex_query(content, query): TRUE if the text matches the regular
-- expression, ignoring case.
--   content: text to search in
--   query:   regular expression
CREATE OR REPLACE FUNCTION regex_query(content text, query text)
RETURNS boolean
LANGUAGE sql
AS $body$
    SELECT $1 ~* $2;
$body$;

-- The same regular expression match through the helper function
SELECT
    file,
    regex_query(content, 'ozon') AS "matched"
FROM collection
ORDER BY file;

-- Phrases
----------

-- Searching for the phrase 'Weltretter mit Hornbrillen' is not possible with
-- FTS; only a search for all of its words is:
SELECT
    file,
    to_tsvector(content) @@ to_tsquery('Weltretter & mit & Hornbrillen') AS "result"
FROM collection
ORDER BY file;

-- With a regular expression the phrase can be searched:
SELECT count(*)
FROM collection
WHERE content ~* 'Weltretter mit Hornbrillen';

SELECT *
FROM collection
WHERE to_tsvector(content) @@ to_tsquery('Weltretter & mit & Hornbrillen');

-- Accelerated: combine the exact regex match with the FTS condition
EXPLAIN
SELECT
    id,
    file
FROM collection
WHERE content ~* 'Weltretter mit Hornbrillen'
  AND to_tsvector(content) @@ to_tsquery('Weltretter & mit & Hornbrillen');


-- Some System Tables
---------------------

-- Text search configurations
SELECT *
FROM pg_ts_config;

-- Text search dictionaries
SELECT *
FROM pg_ts_dict;

-- Mapping of token types to dictionaries per configuration
SELECT *
FROM pg_ts_config_map;

-- Text search parsers
SELECT *
FROM pg_ts_parser;

-- Text search dictionary templates
SELECT *
FROM pg_ts_template;

-- Current server activity
SELECT *
FROM pg_stat_activity;

-- NOTE(review): procpid and current_query are the column names used before
-- PostgreSQL 9.2 (later releases call them pid and query) - confirm the
-- target version.
SELECT
    procpid,
    current_query
FROM pg_stat_activity;




-- ==================
-- The tsearch2 Guide
-- ==================

/* 
Brandon Craig Rhodes,  30 June 2003
Updated to 8.2 release by Oleg Bartunov, October 2006
From: http://www.sai.msu.su/~megera/postgres/gist/tsearch/V2/docs/tsearch2-guide.html

SELECT * from pg_ts_config;
SELECT set_config('simple');
SELECT set_config('german');
SELECT set_ts_config('english');
vacuum full analyze
select ts_lexize('simple', '100');

*/

-- Show the active text search configuration and how a word is analysed
SELECT get_current_ts_config();
SELECT * FROM ts_debug('english', 'Paris');

-- creating vectors / reducing documents to vectors
-- (first statement terminated with a semicolon so that the two SELECTs are
-- executed separately)
SELECT to_tsvector('The air smells of sea water.'); -- with word positions in vector
SELECT strip(to_tsvector('The air smells of sea water.')); -- without word positions 

-- Table holding precomputed document vectors, indexed with GiST
CREATE TABLE vectors (
    vector tsvector
);

CREATE INDEX vector_index
  ON vectors
  USING gist (vector);

-- Three sample documents, stored as vectors
INSERT INTO vectors (vector)
VALUES
    (to_tsvector('The path forks here')),
    (to_tsvector('A crawl leads west')),
    (to_tsvector('The left fork leads northeast'));

SELECT *
FROM vectors;

-- Goal: documents containing forks or leads, but not crawl,
-- i.e. (forks|leads) & !crawl
-- Note the query converts leads|forks to lead, fork:
SELECT to_tsquery('(leads|forks) & !crawl');

-- to_tsquery() expects operators between the words; it can not accept a
-- space separated string:
SELECT to_tsquery('this is many words');

SELECT to_tsquery('english', 'a|is&not|!the');

-- Now, do the query against vectors:
SELECT *
FROM vectors
WHERE vector @@ to_tsquery('(leads|forks) & !crawl');

-- Some Ranking!
SELECT ts_rank_cd(
    to_tsvector('english', 'in the list of stop words'),
    to_tsquery('list & stop')
);

-- ***

-- Solution Ex 2. Question 1:
-- Search for 'net' and 'cyberspace'. Note in how many files (not how many
-- times!) both words occur and how long the search took.

-- Search.
SELECT
    file,
    to_tsvector(content) @@ to_tsquery('net & cyberspace') AS "result"
FROM collection
ORDER BY file;
