<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic What is the optimized way to execute spatial queries in Hive, using GIS Tools for Hadoop? in ArcGIS GeoAnalytics Server Questions</title>
    <link>https://community.esri.com/t5/arcgis-geoanalytics-server-questions/what-is-the-optimized-way-to-execute-spatial/m-p/1295623#M197</link>
    <description>&lt;P&gt;&lt;SPAN&gt;Dear Esri Team,&lt;/SPAN&gt;&lt;/P&gt;&lt;DIV&gt;I am trying to implement spatial queries in Hive using GIS Tools for Hadoop, following the steps mentioned in the link below.&lt;DIV&gt;&amp;nbsp;&lt;DIV&gt;&lt;A href="https://github.com/Esri/gis-tools-for-hadoop/tree/master/samples/point-in-polygon-aggregation-hive" target="_blank" rel="noopener"&gt;https://github.com/Esri/gis-tools-for-hadoop/tree/master/samples/point-in-polygon-aggregation-hive&amp;nbsp;&lt;/A&gt;&lt;DIV&gt;&amp;nbsp;&lt;DIV&gt;The implementation of the queries are as follows&lt;DIV&gt;&amp;nbsp;&lt;DIV&gt;&lt;STRONG&gt;&lt;STRONG&gt;1.&lt;SPAN&gt;&lt;STRONG&gt;Circle range query&lt;BR /&gt;&lt;/STRONG&gt;&lt;/SPAN&gt;&lt;/STRONG&gt;&lt;/STRONG&gt;SELECT COUNT(*) FROM table_name WHERE&lt;BR /&gt;ST_Intersects(ST_Buffer(ST_Point(x1,y1), radius), ST_Point(x, y));&lt;STRONG&gt;&lt;STRONG&gt;&lt;SPAN&gt;&lt;STRONG&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;STRONG&gt;2. BBOX range query&lt;BR /&gt;&lt;/STRONG&gt;&lt;/STRONG&gt;&lt;/SPAN&gt;&lt;/STRONG&gt;&lt;/STRONG&gt;SELECT COUNT(*) FROM table_name WHERE&lt;BR /&gt;ST_Contains(ST_GeomFromText('POLYGON ((x1 y1, x2 y2, .. , x1 y1))'), ST_Point(x, y));&lt;STRONG&gt;&lt;STRONG&gt;&lt;SPAN&gt;&lt;STRONG&gt;&lt;STRONG&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;STRONG&gt;3. k-NN query (k=10)&lt;BR /&gt;&lt;/STRONG&gt;&lt;/STRONG&gt;&lt;/STRONG&gt;&lt;/SPAN&gt;&lt;/STRONG&gt;&lt;/STRONG&gt;SELECT ST_Distance(ST_Point(x1, y1), ST_Point(x, y)) as distance, x, y FROM&lt;STRONG&gt;&lt;STRONG&gt;&lt;SPAN&gt;&lt;STRONG&gt;&lt;STRONG&gt;&lt;STRONG&gt;&lt;BR /&gt;&lt;/STRONG&gt;&lt;/STRONG&gt;&lt;/STRONG&gt;&lt;/SPAN&gt;&lt;/STRONG&gt;&lt;/STRONG&gt;table_name ORDER BY distance ASC LIMIT 10;&lt;DIV&gt;&amp;nbsp;&lt;DIV&gt;Kindly let me know the optimized way to implement the above queries (especially k-NN).&lt;DIV&gt;&amp;nbsp;&lt;DIV&gt;Thanks &amp;amp; Regards&lt;DIV&gt;Vigneswaran&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;</description>
    <pubDate>Sun, 04 Jun 2023 11:38:52 GMT</pubDate>
    <dc:creator>vigneshwaran</dc:creator>
    <dc:date>2023-06-04T11:38:52Z</dc:date>
    <item>
      <title>What is the optimized way to execute spatial queries in Hive, using GIS Tools for Hadoop?</title>
      <link>https://community.esri.com/t5/arcgis-geoanalytics-server-questions/what-is-the-optimized-way-to-execute-spatial/m-p/1295623#M197</link>
      <description>&lt;P&gt;&lt;SPAN&gt;Dear Esri Team,&lt;/SPAN&gt;&lt;/P&gt;&lt;DIV&gt;I am trying to implement spatial queries in Hive using GIS Tools for Hadoop, following the steps mentioned in the link below.&lt;DIV&gt;&amp;nbsp;&lt;DIV&gt;&lt;A href="https://github.com/Esri/gis-tools-for-hadoop/tree/master/samples/point-in-polygon-aggregation-hive" target="_blank" rel="noopener"&gt;https://github.com/Esri/gis-tools-for-hadoop/tree/master/samples/point-in-polygon-aggregation-hive&amp;nbsp;&lt;/A&gt;&lt;DIV&gt;&amp;nbsp;&lt;DIV&gt;The implementation of the queries are as follows&lt;DIV&gt;&amp;nbsp;&lt;DIV&gt;&lt;STRONG&gt;&lt;STRONG&gt;1.&lt;SPAN&gt;&lt;STRONG&gt;Circle range query&lt;BR /&gt;&lt;/STRONG&gt;&lt;/SPAN&gt;&lt;/STRONG&gt;&lt;/STRONG&gt;SELECT COUNT(*) FROM table_name WHERE&lt;BR /&gt;ST_Intersects(ST_Buffer(ST_Point(x1,y1), radius), ST_Point(x, y));&lt;STRONG&gt;&lt;STRONG&gt;&lt;SPAN&gt;&lt;STRONG&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;STRONG&gt;2. BBOX range query&lt;BR /&gt;&lt;/STRONG&gt;&lt;/STRONG&gt;&lt;/SPAN&gt;&lt;/STRONG&gt;&lt;/STRONG&gt;SELECT COUNT(*) FROM table_name WHERE&lt;BR /&gt;ST_Contains(ST_GeomFromText('POLYGON ((x1 y1, x2 y2, .. , x1 y1))'), ST_Point(x, y));&lt;STRONG&gt;&lt;STRONG&gt;&lt;SPAN&gt;&lt;STRONG&gt;&lt;STRONG&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;STRONG&gt;3. k-NN query (k=10)&lt;BR /&gt;&lt;/STRONG&gt;&lt;/STRONG&gt;&lt;/STRONG&gt;&lt;/SPAN&gt;&lt;/STRONG&gt;&lt;/STRONG&gt;SELECT ST_Distance(ST_Point(x1, y1), ST_Point(x, y)) as distance, x, y FROM&lt;STRONG&gt;&lt;STRONG&gt;&lt;SPAN&gt;&lt;STRONG&gt;&lt;STRONG&gt;&lt;STRONG&gt;&lt;BR /&gt;&lt;/STRONG&gt;&lt;/STRONG&gt;&lt;/STRONG&gt;&lt;/SPAN&gt;&lt;/STRONG&gt;&lt;/STRONG&gt;table_name ORDER BY distance ASC LIMIT 10;&lt;DIV&gt;&amp;nbsp;&lt;DIV&gt;Kindly let me know the optimized way to implement the above queries (especially k-NN).&lt;DIV&gt;&amp;nbsp;&lt;DIV&gt;Thanks &amp;amp; Regards&lt;DIV&gt;Vigneswaran&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;</description>
      <pubDate>Sun, 04 Jun 2023 11:38:52 GMT</pubDate>
      <guid>https://community.esri.com/t5/arcgis-geoanalytics-server-questions/what-is-the-optimized-way-to-execute-spatial/m-p/1295623#M197</guid>
      <dc:creator>vigneshwaran</dc:creator>
      <dc:date>2023-06-04T11:38:52Z</dc:date>
    </item>
    <item>
      <title>Re: What is the optimized way to execute spatial queries in Hive, using GIS Tools for Hadoop?</title>
      <link>https://community.esri.com/t5/arcgis-geoanalytics-server-questions/what-is-the-optimized-way-to-execute-spatial/m-p/1650155#M199</link>
      <description>&lt;P&gt;Hi &lt;a href="https://community.esri.com/t5/user/viewprofilepage/user-id/113344"&gt;@vigneshwaran&lt;/a&gt;,&lt;BR /&gt;Hive can do these, but you’ll get much better performance with a &lt;STRONG&gt;two-stage filter&lt;/STRONG&gt; (cheap spatial prefilter → exact test) and a &lt;STRONG&gt;coarse spatial index&lt;/STRONG&gt; (grid/geohash/H3) to avoid full scans especially for k-NN.&lt;/P&gt;&lt;P&gt;Below are practical patterns that work well with &lt;STRONG&gt;GIS Tools for Hadoop&lt;/STRONG&gt; UDFs.&lt;/P&gt;&lt;H2&gt;&lt;FONT size="4"&gt;0) Table prep (one-time)&lt;/FONT&gt;&lt;/H2&gt;&lt;UL&gt;&lt;LI&gt;&lt;P&gt;Store as &lt;STRONG&gt;ORC/Parquet&lt;/STRONG&gt;, enable &lt;STRONG&gt;Tez&lt;/STRONG&gt;, &lt;STRONG&gt;vectorization&lt;/STRONG&gt;, and collect &lt;STRONG&gt;stats&lt;/STRONG&gt;.&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P&gt;Add a &lt;STRONG&gt;point geometry&lt;/STRONG&gt; column once, rather than building it per query.&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P&gt;Add a &lt;STRONG&gt;coarse spatial key&lt;/STRONG&gt; (e.g., geohash/H3/quadkey) for pruning.&lt;/P&gt;&lt;/LI&gt;&lt;/UL&gt;&lt;LI-CODE lang="sql"&gt;-- Example schema
CREATE TABLE pts (
  id BIGINT,
  x  DOUBLE,         -- lon
  y  DOUBLE,         -- lat
  geom STRING,       -- WKT point
  gh  STRING         -- geohash / tile id
)
STORED AS ORC;

-- Populate (example)
INSERT OVERWRITE TABLE pts
SELECT id, x, y,
       ST_AsText(ST_Point(x, y))                   AS geom,
       geohash(x, y, 7)                            AS gh       -- use your geohash/H3 UDF
FROM raw_pts;

ANALYZE TABLE pts COMPUTE STATISTICS;&lt;/LI-CODE&gt;&lt;P&gt;If you don’t have a geohash/H3 UDF handy, precompute a simple grid key (e.g., floor(x/Δ), floor(y/Δ)).&lt;/P&gt;&lt;P&gt;1) Circle range query (fast pattern)&lt;/P&gt;&lt;P&gt;Avoid buffering every row. Compute the circle once, prefilter by envelope, then run exact check.&lt;/P&gt;&lt;LI-CODE lang="sql"&gt;-- Params
WITH q AS (
  SELECT ST_Point(%x1, %y1)                 AS qpt,
         %radius                            AS r
)
SELECT COUNT(*)
FROM pts
JOIN q
WHERE
  -- cheap prefilter: envelope test
  ST_EnvIntersects(
     ST_Envelope(ST_Buffer(q.qpt, q.r)),    -- computed once
     ST_Envelope(ST_Point(x, y))
  )
  -- exact test
  AND ST_Distance(ST_Point(x, y), q.qpt) &amp;lt;= q.r;&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Why faster? The envelope test prunes most rows before the expensive exact distance.&lt;/P&gt;&lt;P&gt;2) BBOX range query (fast pattern)&lt;/P&gt;&lt;P&gt;Use ST_EnvIntersects first, then (optionally) ST_Contains as exact test.&lt;/P&gt;&lt;LI-CODE lang="sql"&gt;WITH bbox AS (
  SELECT ST_GeomFromText('POLYGON((x1 y1, x2 y2, x3 y3, x4 y4, x1 y1))') AS poly
)
SELECT COUNT(*)
FROM pts
JOIN bbox
WHERE ST_EnvIntersects(bbox.poly, ST_Point(x, y))
  AND ST_Contains(bbox.poly, ST_Point(x, y));   -- optional exact check&lt;/LI-CODE&gt;&lt;P&gt;3) k-NN (k=10): scalable approach&lt;/P&gt;&lt;P&gt;Global ORDER BY distance LIMIT 10 forces a single reducer (slow). Instead, do candidate pruning with your spatial key, then a small sort.&lt;/P&gt;&lt;P&gt;3a) Precompute neighbors to search&lt;/P&gt;&lt;P&gt;For geohash/H3, get the cell containing (x1,y1) and its immediate neighbors (ring). If &amp;lt; k results, expand by another ring.&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;LI-CODE lang="sql"&gt;-- Example: find target cell + 8 neighbors
WITH q AS (
  SELECT %x1 AS qx, %y1 AS qy, geohash(%x1, %y1, 7) AS qgh
),
nbrs AS (
  SELECT q.qx, q.qy, n.gh AS ngh
  FROM q
  LATERAL VIEW explode(geohash_neighbors(q.qgh)) n AS gh   -- UDF returns center+neighbors
)
SELECT  distance, id, x, y
FROM (
  SELECT
    ST_Distance(ST_Point(x, y), ST_Point(qx, qy)) AS distance,
    id, x, y
  FROM pts
  JOIN nbrs ON pts.gh = nbrs.ngh
) cand
ORDER BY distance ASC
LIMIT 10;&lt;/LI-CODE&gt;&lt;P&gt;If your 9 cells don’t yield 10 points, expand to the next ring (second-order neighbors) and rerun. In production, do this iteratively in a procedure, or over-fetch (e.g., 2 rings) once and still be fast.&lt;/P&gt;&lt;P&gt;3b) No geohash? Use a bounding box window as a candidate filter&lt;/P&gt;&lt;P&gt;Start with a small window around (x1,y1), enlarge if needed.&lt;/P&gt;&lt;LI-CODE lang="sql"&gt;WITH q AS (
  SELECT %x1 AS qx, %y1 AS qy, %win AS w
)
SELECT distance, id, x, y
FROM (
  SELECT ST_Distance(ST_Point(x, y), ST_Point(qx, qy)) AS distance, id, x, y
  FROM pts JOIN q
  WHERE x BETWEEN qx - w AND qx + w
    AND y BETWEEN qy - w AND qy + w
) cand
ORDER BY distance
LIMIT 10;&lt;/LI-CODE&gt;&lt;P&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;Pick w based on expected density; if &amp;lt; 10 results, double w and retry.&lt;/P&gt;&lt;P&gt;Extra performance tips&lt;/P&gt;&lt;P&gt;Cache constants: put ST_Buffer(...), ST_Point(x1,y1) in a CTE so they’re computed once.&lt;/P&gt;&lt;P&gt;Partition/Bucket on your spatial key (gh) to get partition pruning.&lt;/P&gt;&lt;P&gt;Use SORT BY distance LIMIT 10 only if you first reduce candidates; otherwise it still hits a single reducer.&lt;/P&gt;&lt;P&gt;Consider converting hot datasets to Parquet with sorted Z-order (lon,lat) or by spatial key to improve locality.&lt;/P&gt;&lt;P&gt;If polygons are small and numerous, pre-tile them (one row per tile) and join on tile id before exact ST_Intersects.&lt;/P&gt;&lt;P&gt;Regards,&lt;BR /&gt;Venkat&lt;/P&gt;</description>
      <pubDate>Fri, 12 Sep 2025 18:36:18 GMT</pubDate>
      <guid>https://community.esri.com/t5/arcgis-geoanalytics-server-questions/what-is-the-optimized-way-to-execute-spatial/m-p/1650155#M199</guid>
      <dc:creator>VenkataKondepati</dc:creator>
      <dc:date>2025-09-12T18:36:18Z</dc:date>
    </item>
  </channel>
</rss>

