Learnings

Monday, December 04, 2006

Nearest Neighbour with Oracle Spatial's SDO_NN

-- The website www.geonames.org has a downloadable database (by country) of all the places
-- in the world along with their latitude and longitude.
-- We are trying to extract some useful info from it.

create table GEONAMES (
GEONAMEID INTEGER,
NAME VARCHAR2(400),
ASCIINAME VARCHAR2(200),
ALTERNATENAMES VARCHAR2(1000),
LATITUDE NUMBER,
LONGITUDE NUMBER,
FEATURECLASS CHAR(1),
FEATURECODE VARCHAR2(10),
COUNTRYCODE CHAR(2),
CC2 VARCHAR2(60),
ADMIN1CODE VARCHAR2(20),
POPULATION NUMBER,
ELEVATION NUMBER,
GTOPO30 NUMBER,
TIMEZONE VARCHAR2(50),
MODIFICATIONDATE DATE,
LOCATION MDSYS.SDO_GEOMETRY)
;

create or replace directory data_dir as 'c:\IndianGeography\';

drop table geonames_ext;

create table GEONAMES_EXT (
GEONAMEID number,
NAME VARCHAR2(400),
ASCIINAME VARCHAR2(200),
ALTERNATENAMES VARCHAR2(1000),
LATITUDE NUMBER,
LONGITUDE NUMBER,
FEATURECLASS CHAR(1),
FEATURECODE VARCHAR2(10),
COUNTRYCODE CHAR(2),
CC2 VARCHAR2(60),
ADMIN1CODE VARCHAR2(20),
POPULATION number,
ELEVATION number,
GTOPO30 number,
TIMEZONE VARCHAR2(50),
MODIFICATIONDATE VARCHAR2(20)
)
ORGANIZATION EXTERNAL
(type oracle_loader
default directory data_dir
access parameters (
records delimited by newline
badfile 'tab2.bad'
logfile 'tab2.log'
fields terminated by ','
optionally enclosed by '"'
missing field values are null
)
location ('GeoNamesIN.csv')
)
;
select count(*) from geonames_ext;
INSERT INTO GEONAMES (
GEONAMEID,
NAME,
ASCIINAME,
ALTERNATENAMES,
LATITUDE,
LONGITUDE,
FEATURECLASS,
FEATURECODE,
COUNTRYCODE,
CC2,
ADMIN1CODE,
POPULATION,
ELEVATION,
GTOPO30,
TIMEZONE,
MODIFICATIONDATE)
SELECT
GEONAMEID,
NAME,
ASCIINAME,
ALTERNATENAMES,
LATITUDE,
LONGITUDE,
FEATURECLASS,
FEATURECODE,
COUNTRYCODE,
CC2,
ADMIN1CODE,
POPULATION,
ELEVATION,
GTOPO30,
TIMEZONE,
TO_DATE(MODIFICATIONDATE,'mm/dd/yyyy')
FROM
GEONAMES_EXT;
commit;

-- Update the location object based on latitude and longitude. Oracle asks for
-- Longitude and THEN Latitude, not the other way round:-(

UPDATE GEONAMES SET
LOCATION =
SDO_GEOMETRY
(2001, -- Geometry type: 2-D point
8307, -- SRID, Datum: WGS84
SDO_POINT_TYPE
(LONGITUDE,
LATITUDE,
NULL),
NULL,
NULL);
commit;

/*
SQL> CREATE INDEX GEOM_INDEX ON GEONAMES(LOCATION) INDEXTYPE IS MDSYS.SPATIAL_INDEX;
CREATE INDEX GEOM_INDEX ON GEONAMES(LOCATION) INDEXTYPE IS MDSYS.SPATIAL_INDEX
*
ERROR at line 1:
ORA-29855: error occurred in the execution of ODCIINDEXCREATE routine
ORA-13203: failed to read USER_SDO_GEOM_METADATA view
ORA-13203: failed to read USER_SDO_GEOM_METADATA view
ORA-06512: at "MDSYS.SDO_INDEX_METHOD_10I", line 10
ORA-06512: at line 1
*/
-- Looks like this insert is necessary to be able to create indexes, else it gives
-- the error above

INSERT INTO user_sdo_geom_metadata VALUES
('GEONAMES', -- Geometry Table
'LOCATION', -- Geometry Column
SDO_DIM_ARRAY (
SDO_DIM_ELEMENT ('LONGITUDE', -- Longitude Text
-180, -- Lower Boundary
180, -- Upper Boundary
0.5), -- Tolerance
SDO_DIM_ELEMENT ('LATITUDE', -- Latitude Text
-90, -- Lower Boundary
90, -- Upper Boundary
0.5) -- Tolerance
),
8307 -- (SRID) Datum:WGS84
);
COMMIT;

SQL> CREATE INDEX GEOM_INDEX ON GEONAMES(LOCATION) INDEXTYPE IS MDSYS.SPATIAL_INDEX;

-- Query for dist betn two points, say Bangalore and Delhi
SELECT SDO_GEOM.SDO_DISTANCE(a.LOCATION, b.LOCATION, 0.5, 'UNIT=KM')
FROM GEONAMES A,
GEONAMES B
WHERE
A.GEONAMEID = 1277331
AND B.GEONAMEID = 1257856

returns 1725 KM
-- Create pk on geonameid
ALTER TABLE GEONAMES ADD CONSTRAINT XPK_GEON PRIMARY KEY (GEONAMEID);

-- Create an index on feature code
create index INDEX_FEATCODE ON GEONAMES(FEATURECODE);
-- Find nearest railway stations to Bangalore
SELECT
A.NAME,
B.NAME,
B.FEATURECLASS,
B.FEATURECODE,
SDO_NN_DISTANCE(1) AS DISTANCE_IN_KM
FROM
GEONAMES A,
GEONAMES B
WHERE B.FEATURECODE IN
('RSTN') -- Railway stations
AND SDO_NN(A.LOCATION, B.LOCATION, 'SDO_BATCH_SIZE=2 UNIT=KM', 1) = 'TRUE'
AND A.GEONAMEID = 1277331
ORDER BY DISTANCE_IN_KM
/* Got this error!
ERROR at line 1:
ORA-13249: SDO_NN cannot be evaluated without using index
ORA-06512: at "MDSYS.MD", line 1723
ORA-06512: at "MDSYS.MDERR", line 17
ORA-06512: at "MDSYS.PRVT_IDX", line 44
*/
/* Looks like the A and B columns may have to be interchanged in the query */
SELECT
SUBSTR(A.ASCIINAME, 1, 30),
SUBSTR(B.ASCIINAME, 1, 30),
B.FEATURECLASS,
B.FEATURECODE,
SDO_NN_DISTANCE(1) AS DISTANCE_IN_KM
FROM
GEONAMES A,
GEONAMES B
WHERE B.FEATURECODE IN
('RSTN') -- Railway stations
AND SDO_NN(B.LOCATION, A.LOCATION, 'SDO_BATCH_SIZE=5 UNIT=KM', 1) = 'TRUE'
AND A.GEONAMEID = 1277331
AND ROWNUM < 10
ORDER BY DISTANCE_IN_KM ASC
-- The query runs but seems to be giving wrong results, it is NOT picking up the nearest
-- stations from Bangalore but giving stations which are more than 1000 km away!
/*
SUBSTR(A.ASCIINAME,1,30) SUBSTR(B.ASCIINAME,1,30) F FEATURECOD DISTANCE_IN_KM
------------------------------ ------------------------------ - ---------- --------------
Bangalore Urban Nawandgi S RSTN 466.853337
Bangalore Urban Nawabupalem S RSTN 605.457477
Bangalore Urban Nayagaon S RSTN 1009.34904
Bangalore Urban Nawapara Road S RSTN 1014.71092
Bangalore Urban Navli S RSTN 1159.73019
Bangalore Urban Navagadh S RSTN 1218.40504
Bangalore Urban Nazarbagh S RSTN 1300.14913
Bangalore Urban Naugarh S RSTN 1682.49445
Bangalore Urban Nawada S RSTN 1736.60645
*/
-- Changing it to use SDO_NUM_RES doesn't seem to work fully either. I ask for
-- 10 results and it gives only 1
SUBSTR(A.ASCIINAME,1,30) SUBSTR(B.ASCIINAME,1,30) F FEATURECOD DISTANCE_IN_KM
------------------------------ ------------------------------ - ---------- --------------
Bangalore Urban Malleswaram S RSTN 2.58237786

-- The following query works but is very inefficient because it finds all the nearest
-- neighbours and then discards all but 10
SELECT * FROM (
SELECT
SUBSTR(A.ASCIINAME, 1, 30),
SUBSTR(B.ASCIINAME, 1, 30),
B.FEATURECLASS,
B.FEATURECODE,
SDO_NN_DISTANCE(1) AS DISTANCE_IN_KM
FROM
GEONAMES A,
GEONAMES B
WHERE B.FEATURECODE IN
('RSTN') -- Railway stations
AND SDO_NN(B.LOCATION, A.LOCATION, 'SDO_BATCH_SIZE=10 UNIT=KM', 1) = 'TRUE'
AND A.GEONAMEID = 1277331
AND A.GEONAMEID <> B.GEONAMEID
ORDER BY DISTANCE_IN_KM ASC)
WHERE ROWNUM < 10

/*
Difference between SDO_BATCH_SIZE and SDO_NUM_RES.
--------------------------------------------------
It looks like SDO_NUM_RES blindly gives the number of nearest neighbours to a
given point WITHOUT APPLYING ANY PREDICATES! In other words, for the above example,
when I use SDO_NUM_RES=10, it first returns the 10 nearest points to Bangalore
(regardless of whether any of them are railway stations or not); the predicate
B.FEATURECODE IN ('RSTN') is applied on these 10 and, if any of them are railway
stations, they are returned, else the query returns no rows. This is the reason
why using SDO_NUM_RES=10 only returned one station ('Malleswaram') in the query given above.
The way to get around this problem is to use SDO_BATCH_SIZE along with the ROWNUM clause;
SDO_BATCH_SIZE will just keep on returning chunks of nearest neighbours. For instance,
SDO_BATCH_SIZE = 10 will return the 10 closest neighbours first, followed by the next 10
closest neighbours and so on until the time something (such as ROWNUM) stops the query.
So, if we use SDO_BATCH_SIZE = 500 and B.FEATURECODE IN ('RSTN') AND ROWNUM <= 10
(assuming that the 10 closest railway stations can be found within the 500 closest
neighbours of Bangalore), then the query should return in just one pass.
So, theoretically, this query should work and work fast (the reason we are using
an INDEX hint is because the docs say that the spatial index must always be hinted).
SELECT /*+ INDEX( B GEOM_INDEX) */
SUBSTR(A.ASCIINAME, 1, 30),
SUBSTR(B.ASCIINAME, 1, 30),
B.FEATURECLASS,
B.FEATURECODE,
SDO_NN_DISTANCE(1) AS DISTANCE_IN_KM
FROM
GEONAMES A,
GEONAMES B
WHERE B.FEATURECODE IN
('RSTN') -- Railway stations
AND SDO_NN(B.LOCATION, A.LOCATION, 'SDO_BATCH_SIZE=500 UNIT=KM', 1) = 'TRUE'
AND A.GEONAMEID = 1277331 -- Bangalore
AND A.GEONAMEID <> B.GEONAMEID
AND ROWNUM < 10
However, it gives wrong results:
---------------------------------------------------------------------------------------------------
SUBSTR(A.ASCIINAME,1,30) SUBSTR(B.ASCIINAME,1,30) F FEATURECOD DISTANCE_IN_KM
------------------------------ ------------------------------ - ---------- --------------
Bangalore Urban Nazarbagh S RSTN 1300.14913
Bangalore Urban Nayagaon S RSTN 1009.34904
Bangalore Urban Nawapara Road S RSTN 1014.71092
Bangalore Urban Nawandgi S RSTN 466.853337
Bangalore Urban Nawada S RSTN 1736.60645
Bangalore Urban Nawabupalem S RSTN 605.457477
Bangalore Urban Navli S RSTN 1159.73019
Bangalore Urban Navagadh S RSTN 1218.40504
Bangalore Urban Naugarh S RSTN 1682.49445
9 rows selected.
Elapsed: 00:00:09.72
Execution Plan
----------------------------------------------------------
0 SELECT STATEMENT Optimizer=ALL_ROWS (Cost=7 Card=2 Bytes=444
)
1 0 COUNT (STOPKEY)
2 1 NESTED LOOPS (Cost=7 Card=2 Bytes=444)
3 2 TABLE ACCESS (BY INDEX ROWID) OF 'GEONAMES' (TABLE) (C
ost=2 Card=1 Bytes=111)
4 3 INDEX (UNIQUE SCAN) OF 'XPK_GEON' (INDEX (UNIQUE)) (
Cost=1 Card=1)
5 2 TABLE ACCESS (BY INDEX ROWID) OF 'GEONAMES' (TABLE) (C
ost=7 Card=2 Bytes=222)
6 5 BITMAP CONVERSION (TO ROWIDS)
7 6 BITMAP AND
8 7 BITMAP CONVERSION (FROM ROWIDS)
9 8 INDEX (RANGE SCAN) OF 'INDEX_FEATCODE' (INDEX)
(Cost=0 Card=247)
10 7 BITMAP CONVERSION (FROM ROWIDS)
11 10 SORT (ORDER BY)
12 11 DOMAIN INDEX OF 'GEOM_INDEX' (INDEX (DOMAIN)
)




Statistics
----------------------------------------------------------
77367 recursive calls
1017480 db block gets
44711 consistent gets
0 physical reads
0 redo size
931 bytes sent via SQL*Net to client
508 bytes received via SQL*Net from client
2 SQL*Net roundtrips to/from client
3 sorts (memory)
0 sorts (disk)
9 rows processed

-------------------------------------------------------------------------------------------------
From the 10g docs:
"However, if there is an index associated with the column predicate in the WHERE clause,
be sure that this index is not used by specifying the NO_INDEX hint for that index. "
In our case, the autotrace output above shows that the bitmap on featurecode IS being used
and maybe what is causing the wrong results. So, as per the docs, let us now add a
NO_INDEX hint also and check:
SELECT /*+ INDEX( B GEOM_INDEX) NO_INDEX(B INDEX_FEATCODE) */
SUBSTR(A.ASCIINAME, 1, 30),
SUBSTR(B.ASCIINAME, 1, 30),
B.FEATURECLASS,
B.FEATURECODE,
SDO_NN_DISTANCE(1) AS DISTANCE_IN_KM
FROM
GEONAMES A,
GEONAMES B
WHERE B.FEATURECODE IN
('RSTN') -- Railway stations
AND SDO_NN(B.LOCATION, A.LOCATION, 'SDO_BATCH_SIZE=500 UNIT=KM', 1) = 'TRUE'
AND A.GEONAMEID = 1277331 -- Bangalore
AND A.GEONAMEID <> B.GEONAMEID
AND ROWNUM < 10
--------------------------------------------------------------------------------------------------
SUBSTR(A.ASCIINAME,1,30) SUBSTR(B.ASCIINAME,1,30) F FEATURECOD DISTANCE_IN_KM
------------------------------ ------------------------------ - ---------- --------------
Bangalore Urban Malleswaram S RSTN 2.58237786
Bangalore Urban Bettahalasur S RSTN 14.8611446
Bangalore Urban Rajankunti S RSTN 20.3627671
Bangalore Urban Dod Jala S RSTN 21.5323817
Bangalore Urban Hejjala S RSTN 25.828336
Bangalore Urban Devankundi S RSTN 27.1846953
Bangalore Urban Dodbele S RSTN 37.4330158
Bangalore Urban Oddarahalli S RSTN 39.0987432
Bangalore Urban Nidvanda S RSTN 41.5279711
9 rows selected.
Elapsed: 00:00:00.26
Execution Plan
----------------------------------------------------------
0 SELECT STATEMENT Optimizer=ALL_ROWS (Cost=75 Card=2 Bytes=44
4)
1 0 COUNT (STOPKEY)
2 1 NESTED LOOPS (Cost=75 Card=2 Bytes=444)
3 2 TABLE ACCESS (BY INDEX ROWID) OF 'GEONAMES' (TABLE) (C
ost=2 Card=1 Bytes=111)
4 3 INDEX (UNIQUE SCAN) OF 'XPK_GEON' (INDEX (UNIQUE)) (
Cost=1 Card=1)
5 2 TABLE ACCESS (BY INDEX ROWID) OF 'GEONAMES' (TABLE) (C
ost=75 Card=2 Bytes=222)
6 5 DOMAIN INDEX OF 'GEOM_INDEX' (INDEX (DOMAIN))



Statistics
----------------------------------------------------------
1242 recursive calls
14994 db block gets
2460 consistent gets
0 physical reads
0 redo size
940 bytes sent via SQL*Net to client
508 bytes received via SQL*Net from client
2 SQL*Net roundtrips to/from client
2 sorts (memory)
0 sorts (disk)
9 rows processed
--------------------------------------------------------------------------------------------------
THAT WORKS FINE AND THAT WORKS FAST! Here is an instance where the use of an index is
changing the results produced! (and it looks like this is documented behaviour too!)