<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Extract Text from PDF maps in Python Questions</title>
    <link>https://community.esri.com/t5/python-questions/extract-text-from-pdf-maps/m-p/1313878#M68307</link>
    <description>&lt;P&gt;You had me at re.&lt;/P&gt;&lt;P&gt;This does work the way I expected it to.&amp;nbsp; I too am not sure why a 7-year-old version of pypdf2 is in the packages other than as a&amp;nbsp; dependency.&lt;/P&gt;&lt;P&gt;I was trying to avoid setting up a new environment for this, which would be needed to add pypdf.&amp;nbsp;&lt;/P&gt;&lt;P&gt;Anyhow, thanks for the well-crafted solution.&amp;nbsp;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Mon, 31 Jul 2023 22:52:55 GMT</pubDate>
    <dc:creator>DavidAnderson_1701</dc:creator>
    <dc:date>2023-07-31T22:52:55Z</dc:date>
    <item>
      <title>Extract Text from PDF maps</title>
      <link>https://community.esri.com/t5/python-questions/extract-text-from-pdf-maps/m-p/1313465#M68289</link>
      <description>&lt;P&gt;Hello,&lt;/P&gt;&lt;P&gt;I want to extract some of the text strings that are in a PDF map document that comes from an exported layout.&amp;nbsp; &amp;nbsp;For example, I want to extract the string AZ-FTA-000574 that is stored in a PDF.&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="DavidAnderson_1701_1-1690759242088.png" style="width: 400px;"&gt;&lt;img src="https://community.esri.com/t5/image/serverpage/image-id/76787iA2DF0D5454F54651/image-size/medium?v=v2&amp;amp;px=400" role="button" title="DavidAnderson_1701_1-1690759242088.png" alt="DavidAnderson_1701_1-1690759242088.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I tried the PyPDF2 that comes with ArcGIS Pro.&amp;nbsp; That returns the georeferencing information but no text.&amp;nbsp; The text is present, as the PDF is searchable for labels as per this post.&lt;/P&gt;&lt;P&gt;&lt;A href="https://support.esri.com/en-us/knowledge-base/problem-unable-to-search-for-text-in-an-exported-pdf-fr-000027716" target="_blank" rel="noopener"&gt;https://support.esri.com/en-us/knowledge-base/problem-unable-to-search-for-text-in-an-exported-pdf-fr-000027716&lt;/A&gt;&lt;/P&gt;&lt;P&gt;The PyPDF2 included is version 1.26 which appears to be a circa 2016 package.&amp;nbsp; A bit out of date.&lt;/P&gt;&lt;P&gt;I'd like to do this with the out of the box tools shipped with Pro, rather than installing ReportLab or other Python PDF tools.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;DIV class=""&gt;&amp;nbsp;&lt;/DIV&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Sun, 30 Jul 2023 23:21:22 GMT</pubDate>
      <guid>https://community.esri.com/t5/python-questions/extract-text-from-pdf-maps/m-p/1313465#M68289</guid>
      <dc:creator>DavidAnderson_1701</dc:creator>
      <dc:date>2023-07-30T23:21:22Z</dc:date>
    </item>
    <item>
      <title>Re: Extract Text from PDF maps</title>
      <link>https://community.esri.com/t5/python-questions/extract-text-from-pdf-maps/m-p/1313466#M68290</link>
      <description>&lt;P&gt;This is a similar question to&amp;nbsp;&lt;A href="https://community.esri.com/t5/geoprocessing-questions/extracting-annotation-from-pdf-maps/m-p/1277537#M26819" target="_blank"&gt;https://community.esri.com/t5/geoprocessing-questions/extracting-annotation-from-pdf-maps/m-p/1277537#M26819&lt;/A&gt;&lt;/P&gt;&lt;P&gt;No answers there though.&lt;/P&gt;</description>
      <pubDate>Sun, 30 Jul 2023 23:23:22 GMT</pubDate>
      <guid>https://community.esri.com/t5/python-questions/extract-text-from-pdf-maps/m-p/1313466#M68290</guid>
      <dc:creator>DavidAnderson_1701</dc:creator>
      <dc:date>2023-07-30T23:23:22Z</dc:date>
    </item>
    <item>
      <title>Re: Extract Text from PDF maps</title>
      <link>https://community.esri.com/t5/python-questions/extract-text-from-pdf-maps/m-p/1313473#M68291</link>
      <description>&lt;P&gt;Do you have a sample pdf you can share?&lt;/P&gt;</description>
      <pubDate>Mon, 31 Jul 2023 01:05:48 GMT</pubDate>
      <guid>https://community.esri.com/t5/python-questions/extract-text-from-pdf-maps/m-p/1313473#M68291</guid>
      <dc:creator>Anonymous User</dc:creator>
      <dc:date>2023-07-31T01:05:48Z</dc:date>
    </item>
    <item>
      <title>Re: Extract Text from PDF maps</title>
      <link>https://community.esri.com/t5/python-questions/extract-text-from-pdf-maps/m-p/1313495#M68292</link>
      <description>&lt;P&gt;Here is a sample file.&lt;/P&gt;&lt;P&gt;It is a two-page PDF.&amp;nbsp; The first page is the one that has the information to be extracted.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;A href="https://ftp.wildfire.gov/public/incident_specific_data/southwest/GACC_Incidents/2023/2023_CottonwoodRidge/GIS/Maps/20230730/Pilot_and_Table_11x17_Land_20230729_2148_Cottonwood%20Ridge_AZFTA000555_0730day.pdf" target="_blank" rel="noopener"&gt;https://ftp.wildfire.gov/public/incident_specific_data/southwest/GACC_Incidents/2023/2023_CottonwoodRidge/GIS/Maps/20230730/Pilot_and_Table_11x17_Land_20230729_2148_Cottonwood%20Ridge_AZFTA000555_0730day.pdf&lt;/A&gt;&lt;/P&gt;</description>
      <pubDate>Mon, 31 Jul 2023 03:49:56 GMT</pubDate>
      <guid>https://community.esri.com/t5/python-questions/extract-text-from-pdf-maps/m-p/1313495#M68292</guid>
      <dc:creator>DavidAnderson_1701</dc:creator>
      <dc:date>2023-07-31T03:49:56Z</dc:date>
    </item>
    <item>
      <title>Re: Extract Text from PDF maps</title>
      <link>https://community.esri.com/t5/python-questions/extract-text-from-pdf-maps/m-p/1313572#M68296</link>
      <description>&lt;P&gt;Looks like pypdf2 (&lt;SPAN class=""&gt;&lt;SPAN class=""&gt;&lt;STRONG&gt;PyPDF2 is deprecated since December 2022&lt;/STRONG&gt;&lt;/SPAN&gt;&lt;/SPAN&gt;) fails to grab any text from that pdf, but the pypdf (same maintainer) package that is recommended (by the developer) to use gets it. Hard to say why pypdf2 is still in the base environment other than Dec 2022 is relatively recent and it is probably a dependency related install.&lt;/P&gt;&lt;P&gt;You can have the script install pypdf if it is not installed already...&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;import re

try:
    from pypdf import PdfReader
except ImportError:
    # pypdf is not available in the active environment: attempt a quiet,
    # non-interactive conda install and then retry the import.
    import importlib
    import json
    from subprocess import run

    proc = run(["conda", "install", "pypdf", "-q", "-y", "--json"], text=True, capture_output=True)

    if proc.returncode != 0:
        # Stop here with conda's own report instead of continuing and hitting
        # a misleading NameError on PdfReader further down.
        try:
            detail = json.loads(proc.stdout)
        except ValueError:
            # stdout was empty or not JSON; fall back to the raw streams.
            detail = proc.stderr or proc.stdout
        raise RuntimeError(f'conda could not install pypdf: {detail}')

    # The package appeared after the interpreter started, so refresh the
    # import system's finder caches before importing it.
    importlib.invalidate_caches()
    from pypdf import PdfReader

# creating a pdf reader object
reader = PdfReader(
    r'C:\Users\...\Pilot_and_Table_11x17_Land_20230729_2148_Cottonwood Ridge_AZFTA000555_0730day.pdf')

# printing number of pages in pdf file
print(f'pages in pdf: {len(reader.pages)}')

# getting a specific page from the pdf file (the first page)
page = reader.pages[0]

# extracting text from page
text = page.extract_text()

# use regex to get the string:
# r"(?:\w*-\w*-\d*)"
# Non-capturing group (?:\w*-\w*-\d*)
# \w matches any word character (equivalent to [a-zA-Z0-9_])
# * matches the previous token between zero and unlimited times, as many times as possible, giving back as needed (greedy)
# - matches the character - (code point 45 decimal, 2D hex, 55 octal) literally (case sensitive)
# \w matches any word character (equivalent to [a-zA-Z0-9_])
# \d matches a digit (equivalent to [0-9])

# Raw string so that \w and \d reach the regex engine untouched rather than
# being treated as (invalid) Python string escape sequences.
comp = re.compile(r'(?:\w*-\w*-\d*)')
res = comp.search(text)

if res is None:
    # Fail with a clear message rather than an AttributeError on res.group(0).
    raise ValueError('no hyphen-separated identifier found in the extracted page text')

print(f'extracted: {res.group(0)}')&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;result:&lt;/P&gt;&lt;P&gt;pages in pdf: 2&lt;BR /&gt;extracted: AZ-FTA-000555&lt;/P&gt;&lt;P&gt;out from all the text that is returned:&lt;/P&gt;&lt;P&gt;2&lt;BR /&gt;51DIV A&lt;BR /&gt;DIV G&lt;BR /&gt;DIV DCopyright:© 2013 National Geographic Society, i-cubed&lt;BR /&gt;34°14.5'N 34°14'N 34°13.5'N 34°13'N 34°12.5'N 34°12'N 34°11.5'N 34°11'N 34°10.5'N 34°10'N 34°9.5'N 34°8.99'N 34°8.5'N 34°8'N 34°7.5'N34°14'N 34°13.5'N 34°13'N 34°12.5'N 34°12'N 34°11.5'N 34°11'N 34°10.5'N 34°10'N 34°9.5'N 34°8.99'N 34°8.5'N 34°8'N110°4'W 110°4.5'W 110°5'W 110°5.5'W 110°6.01'W 110°6.5'W 110°7'W 110°7.5'W 110°8'W 110°8.5'W 110°9'W 110°9.5'W 110°10'W 110°10.5'W 110°11'W 110°11.5'W 110°12'W 110°12.5'W 110°13'W 110°13.51'W 110°14'W 110°14.5'W 110°15'W&lt;BR /&gt;110°4'W 110°4.5'W 110°5'W 110°5.5'W 110°6.01'W 110°6.5'W 110°7'W 110°7.5'W 110°8'W 110°8.5'W 110°9'W 110°9.5'W 110°10'W 110°10.5'W 110°11'W 110°11.5'W 110°12'W 110°12.5'W 110°13'W 110°13.51'W 110°14'W 110°14.5'W 110°15'W/Helispot&lt;BR /&gt;Division Break&lt;BR /&gt;Wildfire Daily Fire Perimeter&lt;BR /&gt;Temporary Flight Restriction&lt;BR /&gt;Contained&lt;BR /&gt;Uncontained&lt;BR /&gt;7/29/2023 2249&lt;BR /&gt;Acres from IR and GPS Acres from&lt;BR /&gt;IR and GPS Acres from IR and GPS&lt;BR /&gt;Acres from IR and GPS346Cottonwood Ridge&lt;BR /&gt;&lt;STRONG&gt;AZ-FTA-000555&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;07/29/2023 dayPilot&lt;/STRONG&gt;&lt;BR /&gt;0 1 2&lt;BR /&gt;Miles&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Mon, 31 Jul 2023 12:44:26 GMT</pubDate>
      <guid>https://community.esri.com/t5/python-questions/extract-text-from-pdf-maps/m-p/1313572#M68296</guid>
      <dc:creator>Anonymous User</dc:creator>
      <dc:date>2023-07-31T12:44:26Z</dc:date>
    </item>
    <item>
      <title>Re: Extract Text from PDF maps</title>
      <link>https://community.esri.com/t5/python-questions/extract-text-from-pdf-maps/m-p/1313878#M68307</link>
      <description>&lt;P&gt;You had me at re.&lt;/P&gt;&lt;P&gt;This does work the way I expected it to.&amp;nbsp; I too am not sure why a 7-year-old version of pypdf2 is in the packages other than as a&amp;nbsp; dependency.&lt;/P&gt;&lt;P&gt;I was trying to avoid setting up a new environment for this, which would be needed to add pypdf.&amp;nbsp;&lt;/P&gt;&lt;P&gt;Anyhow, thanks for the well-crafted solution.&amp;nbsp;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Mon, 31 Jul 2023 22:52:55 GMT</pubDate>
      <guid>https://community.esri.com/t5/python-questions/extract-text-from-pdf-maps/m-p/1313878#M68307</guid>
      <dc:creator>DavidAnderson_1701</dc:creator>
      <dc:date>2023-07-31T22:52:55Z</dc:date>
    </item>
  </channel>
</rss>

