Determine if a PDF is searchable

Content extraction
5/8/2012

Downloads

This code sample shows how to use Page.CreateShapes to determine if a PDF document has searchable text; this means that the text is saved in the PDF as glyphs and not, for example, as an image.

C# code sample

/// <summary>
/// Opens the sample PDF and writes to the console whether any of its pages
/// yields at least one text shape.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    string filename = @"..\..\..\inputdocuments/redaction.pdf";

    // Pass the path as-is. Routing it through string.Format would interpret it as a
    // composite format string and throw FormatException for paths containing '{' or '}'.
    using (FileStream file = new FileStream(filename, FileMode.Open, FileAccess.Read))
    {
        Document document = new Document(file);

        bool searchable = false;
        foreach (Page page in document.Pages)
        {
            ShapeCollection shapes = page.CreateShapes();
            searchable = hasText(shapes);
            if (searchable)
            {
                // One page with text is enough; skip the remaining pages.
                break;
            }
        }

        Console.WriteLine("\"{0}\" {1} text.", filename, searchable ? "has" : "does not have");
    }
}

/// <summary>
/// Checks whether a shape collection contains a <c>TextShape</c>, either directly
/// or inside a nested <c>ShapeCollection</c> at any depth.
/// </summary>
/// <param name="shapes">The shape collection to inspect.</param>
/// <returns>
/// <c>true</c> as soon as a <c>TextShape</c> is found; <c>false</c> if none is found.
/// </returns>
static bool hasText(ShapeCollection shapes)
{
    foreach (Shape shape in shapes)
    {
        if (shape is TextShape)
        {
            return true;
        }

        // Nested collections are shapes too; descend into them recursively.
        ShapeCollection innerShapes = shape as ShapeCollection;
        if (innerShapes != null && hasText(innerShapes))
        {
            return true;
        }
    }

    return false;
}

VB.NET code sample

''' <summary>
''' Opens the sample PDF and writes to the console whether any of its pages
''' yields at least one text shape.
''' </summary>
Sub Main()
    Dim filename As String = "..\..\..\inputdocuments/redaction.pdf"

    ' Pass the path as-is. Routing it through String.Format would interpret it as a
    ' composite format string and throw FormatException for paths containing "{" or "}".
    Using file As New FileStream(filename, FileMode.Open, FileAccess.Read)
        Dim document As New Document(file)

        Dim searchable As Boolean = False
        For Each page As Page In document.Pages
            Dim shapes As ShapeCollection = page.CreateShapes()
            searchable = hasText(shapes)
            If searchable Then
                ' One page with text is enough; skip the remaining pages.
                Exit For
            End If
        Next

        Console.WriteLine("""{0}"" {1} text.", filename, If(searchable, "has", "does not have"))
    End Using
End Sub

''' <summary>
''' Checks whether a shape collection contains a TextShape, either directly
''' or inside a nested ShapeCollection at any depth.
''' </summary>
''' <param name="shapes">The shape collection to inspect.</param>
''' <returns>
''' True as soon as a TextShape is found; False if none is found.
''' </returns>
Private Function hasText(shapes As ShapeCollection) As Boolean
    For Each shape As Shape In shapes
        If TypeOf shape Is TextShape Then
            Return True
        End If

        ' Nested collections are shapes too; descend into them recursively.
        Dim innerShapes As ShapeCollection = TryCast(shape, ShapeCollection)
        If innerShapes IsNot Nothing AndAlso hasText(innerShapes) Then
            Return True
        End If
    Next

    Return False
End Function