Extract graphics from PDF

Content extraction
7/25/2014

This C# code sample shows how to extract text, images and curves as shapes from a PDF document.

extract-graphics-from-pdf.PNG

Shapes

The Shape class is an abstract class with concrete specializations such as TextShape, ImageShape and LineShape. Shapes represent graphics of all types and were originally introduced for drawing on a new or existing PDF page. The reverse is also possible: existing graphics on a PDF page can be extracted as shapes. The central method for this is Page.CreateShapes.

The following C# code sample enumerates all shapes on each page of a PDF document and dumps their properties to the console.

C# code sample

// Running totals, incremented by the DumpShapeInfo overloads and printed by Main.
static int shapeCollections = 0;
static int textShapes = 0;
static int imageShapes = 0;
static int freehandShapes = 0;
static int layerShapes = 0;
static int clipShapes = 0;
static int otherShapes = 0;

//
// Entry point: opens the sample PDF, dumps every shape on every page to the
// console and finally prints how many shapes of each kind were seen.
//
static void Main(string[] args)
{
    using (FileStream fileIn = new FileStream(@"..\..\..\inputDocuments/vectorgraphics.pdf",
                                              FileMode.Open,
                                              FileAccess.Read))
    {
        Document pdfIn = new Document(fileIn);
        IterateShapes(pdfIn);
    }

    Console.WriteLine("Nr of shape collections = {0}", shapeCollections);
    Console.WriteLine("Nr of text shapes = {0}", textShapes);
    Console.WriteLine("Nr of image shapes = {0}", imageShapes);
    Console.WriteLine("Nr of freehand shapes = {0}", freehandShapes);
    Console.WriteLine("Nr of layer shapes = {0}", layerShapes);
    Console.WriteLine("Nr of clip shapes = {0}", clipShapes);
    Console.WriteLine("Nr of other shapes = {0}", otherShapes);
}


//
// Iterate through all pages in a PDF document.
//
static void IterateShapes(Document pdf)
{
    foreach (Page page in pdf.Pages)
    {
        IterateShapes(page);
    }
}


//
// Get all shapes on a PDF page (Page.CreateShapes returns them as a shape
// collection) and dump them, starting without indentation.
//
static void IterateShapes(Page page)
{
    ShapeCollection shapes = page.CreateShapes();
    IterateShapes(shapes, "");
}


//
// Dump a shape collection itself, followed by everything it contains.
// Nested collections and layers are walked recursively.
//
static void IterateShapes(ShapeCollection shapes, string indent)
{
    DumpShapeInfo(shapes, indent);
    IterateChildren(shapes, indent);
}


//
// Dump all shapes contained in a layer shape. The layer itself is NOT dumped
// here; the caller is expected to have reported it already.
//
static void IterateShapes(LayerShape shapes, string indent)
{
    foreach (Shape shape in shapes)
    {
        VisitShape(shape, indent);
    }
}


//
// Dump all shapes contained in a shape collection without dumping the
// collection itself, so that a nested collection is reported (and counted)
// exactly once, by whoever visits it.
//
private static void IterateChildren(ShapeCollection shapes, string indent)
{
    foreach (Shape shape in shapes)
    {
        VisitShape(shape, indent);
    }
}


//
// Dump a single shape once and, when it is a container, descend into its
// children with one extra level of indentation.
//
private static void VisitShape(Shape shape, string indent)
{
    DumpShapeInfo(shape, indent);

    // The two container checks are mutually exclusive on purpose: should
    // LayerShape derive from ShapeCollection (not verifiable from this
    // sample), independent checks would walk the layer's children twice.
    LayerShape layer = shape as LayerShape;
    if (layer != null)
    {
        IterateShapes(layer, indent + " ");
        return;
    }

    ShapeCollection collection = shape as ShapeCollection;
    if (collection != null)
    {
        IterateChildren(collection, indent + " ");
    }
}


//
// Dump information on any kind of shape by dispatching to the overload for
// its concrete type. Exactly one branch runs, so every shape is reported and
// counted once; unrecognised types are counted as "other".
//
internal static void DumpShapeInfo(Shape shape, string indent)
{
    if (shape is TextShape)
    {
        DumpShapeInfo((TextShape)shape, indent);
    }
    else if (shape is ImageShape)
    {
        DumpShapeInfo((ImageShape)shape, indent);
    }
    else if (shape is FreeHandShape)
    {
        DumpShapeInfo((FreeHandShape)shape, indent);
    }
    else if (shape is LayerShape)
    {
        DumpShapeInfo((LayerShape)shape, indent);
    }
    else if (shape is ClipShape)
    {
        DumpShapeInfo((ClipShape)shape, indent);
    }
    else if (shape is ShapeCollection)
    {
        // Tested last so that a more specific type that happens to also be a
        // ShapeCollection is reported under its own name.
        DumpShapeInfo((ShapeCollection)shape, indent);
    }
    else
    {
        Console.WriteLine("{0}Shape = some other type", indent);
        otherShapes++;
    }
}


//
// Dump information on a shape collection.
//
internal static void DumpShapeInfo(ShapeCollection shape, string indent)
{
    Console.WriteLine("{0}Shape = shape collection", indent);
    Console.WriteLine("{0} : N elements = {1}", indent, shape.Count);
    shapeCollections++;
}


//
// Dump information on a layer shape.
//
internal static void DumpShapeInfo(LayerShape shape, string indent)
{
    Console.WriteLine("{0}Shape = Layer shape", indent);
    Console.WriteLine("{0} : blendmode = {1}", indent, shape.BlendMode);
    Console.WriteLine("{0} : opacity = {1}", indent, shape.Opacity);
    Console.WriteLine("{0} : X, Y = {1} {2}", indent, shape.X, shape.Y);
    layerShapes++;
}


//
// Dump information on a text shape.
//
internal static void DumpShapeInfo(TextShape shape, string indent)
{
    Console.WriteLine("{0}Shape = Text shape", indent);
    Console.WriteLine("{0} : Font name = {1}", indent, shape.Font.FamilyName);
    Console.WriteLine("{0} : Font weight = {1}", indent, shape.Font.Weight);
    Console.WriteLine("{0} : Font size = {1}", indent, shape.FontSize);
    Console.WriteLine("{0} : embed mode = {1}", indent, shape.Font.EmbedMode);
    Console.WriteLine("{0} : bounding box left = {1}", indent, shape.BoundingBox.Left);
    Console.WriteLine("{0} : bounding box top = {1}", indent, shape.BoundingBox.Top);
    Console.WriteLine("{0} : bounding box Width = {1}", indent, shape.BoundingBox.Width);
    Console.WriteLine("{0} : bounding box Height = {1}", indent, shape.BoundingBox.Height);
    Console.WriteLine("{0} : blendmode = {1}", indent, shape.BlendMode);
    Console.WriteLine("{0} : opacity = {1}", indent, shape.Opacity);
    Console.WriteLine("{0} : X,Y = {1}, {2}", indent, shape.X, shape.Y);
    Console.WriteLine("{0} : Bold = {1}", indent, shape.Bold);
    Console.WriteLine("{0} : Italic = {1}", indent, shape.Italic);
    Console.WriteLine("{0} : Underline = {1}", indent, shape.Underline);
    Console.WriteLine("{0} : StrikeOut = {1}", indent, shape.StrikeOut);
    Console.WriteLine("{0} : width = {1}", indent, shape.MeasuredWidth);
    Console.WriteLine("{0} : height = {1}", indent, shape.MeasuredHeight);
    textShapes++;
}


//
// Dump information on an image shape.
//
internal static void DumpShapeInfo(ImageShape shape, string indent)
{
    Console.WriteLine("{0}Shape = image shape", indent);
    Console.WriteLine("{0} : blendmode = {1}", indent, shape.BlendMode);
    Console.WriteLine("{0} : mask color = {1}", indent, shape.MaskColor);
    Console.WriteLine("{0} : opacity = {1}", indent, shape.Opacity);
    Console.WriteLine("{0} : X, Y = {1} {2}", indent, shape.X, shape.Y);
    Console.WriteLine("{0} : width = {1}", indent, shape.Width);
    Console.WriteLine("{0} : height = {1}", indent, shape.Height);
    imageShapes++;
}


//
// Dump information on a freehand shape.
//
internal static void DumpShapeInfo(FreeHandShape shape, string indent)
{
    Console.WriteLine("{0}Shape = freehand shape", indent);
    Console.WriteLine("{0} : blendmode = {1}", indent, shape.BlendMode);
    Console.WriteLine("{0} : opacity = {1}", indent, shape.Opacity);
    Console.WriteLine("{0} : N paths = {1}", indent, shape.Paths.Count);
    Console.WriteLine("{0} : fillrule = {1}", indent, shape.FillRule);
    Console.WriteLine("{0} : Brush = {1}", indent, shape.Brush);
    Console.WriteLine("{0} : Pen = {1}", indent, shape.Pen);
    Console.WriteLine("{0} : X, Y = {1} {2}", indent, shape.X, shape.Y);
    freehandShapes++;
}


//
// Dump information on a clip shape.
//
internal static void DumpShapeInfo(ClipShape shape, string indent)
{
    Console.WriteLine("{0}Shape = Clip shape", indent);
    Console.WriteLine("{0} : blendmode = {1}", indent, shape.BlendMode);
    Console.WriteLine("{0} : fillrule = {1}", indent, shape.FillRule);
    Console.WriteLine("{0} : opacity = {1}", indent, shape.Opacity);
    Console.WriteLine("{0} : N paths = {1}", indent, shape.Paths.Count);
    Console.WriteLine("{0} : X, Y = {1} {2}", indent, shape.X, shape.Y);
    clipShapes++;
}

VB.NET code sample

' Running totals, incremented by the DumpShapeInfo overloads and printed by Main.
Dim shapeCollections As Integer = 0
Dim textShapes As Integer = 0
Dim imageShapes As Integer = 0
Dim freehandShapes As Integer = 0
Dim layerShapes As Integer = 0
Dim clipShapes As Integer = 0
Dim otherShapes As Integer = 0

'
' Entry point: opens the sample PDF, dumps every shape on every page to the
' console and finally prints how many shapes of each kind were seen.
'
Private Sub Main(args As String())
    Using fileIn As New FileStream("..\..\..\inputDocuments/vectorgraphics.pdf", FileMode.Open, FileAccess.Read)
        Dim pdfIn As New Document(fileIn)
        IterateShapes(pdfIn)
    End Using

    Console.WriteLine("Nr of shape collections = {0}", shapeCollections)
    Console.WriteLine("Nr of text shapes = {0}", textShapes)
    Console.WriteLine("Nr of image shapes = {0}", imageShapes)
    Console.WriteLine("Nr of freehand shapes = {0}", freehandShapes)
    Console.WriteLine("Nr of layer shapes = {0}", layerShapes)
    Console.WriteLine("Nr of clip shapes = {0}", clipShapes)
    Console.WriteLine("Nr of other shapes = {0}", otherShapes)
End Sub


'
' Iterate through all pages in a PDF document.
'
Private Sub IterateShapes(pdf As Document)
    For Each page As Page In pdf.Pages
        IterateShapes(page)
    Next
End Sub


'
' Get all shapes on a PDF page (Page.CreateShapes returns them as a shape
' collection) and dump them, starting without indentation.
'
Private Sub IterateShapes(page As Page)
    Dim shapes As ShapeCollection = page.CreateShapes()
    IterateShapes(shapes, "")
End Sub


'
' Dump a shape collection itself, followed by everything it contains.
' Nested collections and layers are walked recursively.
'
Private Sub IterateShapes(shapes As ShapeCollection, indent As String)
    DumpShapeInfo(shapes, indent)
    IterateChildren(shapes, indent)
End Sub


'
' Dump all shapes contained in a layer shape. The layer itself is NOT dumped
' here; the caller is expected to have reported it already.
'
Private Sub IterateShapes(shapes As LayerShape, indent As String)
    For Each shape As Shape In shapes
        VisitShape(shape, indent)
    Next
End Sub


'
' Dump all shapes contained in a shape collection without dumping the
' collection itself, so that a nested collection is reported (and counted)
' exactly once, by whoever visits it.
'
Private Sub IterateChildren(shapes As ShapeCollection, indent As String)
    For Each shape As Shape In shapes
        VisitShape(shape, indent)
    Next
End Sub


'
' Dump a single shape once and, when it is a container, descend into its
' children with one extra level of indentation.
'
Private Sub VisitShape(shape As Shape, indent As String)
    DumpShapeInfo(shape, indent)

    ' The two container checks are mutually exclusive on purpose: should
    ' LayerShape derive from ShapeCollection (not verifiable from this
    ' sample), independent checks would walk the layer's children twice.
    Dim layer As LayerShape = TryCast(shape, LayerShape)
    If layer IsNot Nothing Then
        IterateShapes(layer, indent & " ")
        Return
    End If

    Dim collection As ShapeCollection = TryCast(shape, ShapeCollection)
    If collection IsNot Nothing Then
        IterateChildren(collection, indent & " ")
    End If
End Sub


'
' Dump information on any kind of shape by dispatching to the overload for
' its concrete type. Exactly one branch runs, so every shape is reported and
' counted once; unrecognised types are counted as "other".
'
Friend Sub DumpShapeInfo(shape As Shape, indent As String)
    If TypeOf shape Is TextShape Then
        DumpShapeInfo(DirectCast(shape, TextShape), indent)
    ElseIf TypeOf shape Is ImageShape Then
        DumpShapeInfo(DirectCast(shape, ImageShape), indent)
    ElseIf TypeOf shape Is FreeHandShape Then
        DumpShapeInfo(DirectCast(shape, FreeHandShape), indent)
    ElseIf TypeOf shape Is LayerShape Then
        DumpShapeInfo(DirectCast(shape, LayerShape), indent)
    ElseIf TypeOf shape Is ClipShape Then
        DumpShapeInfo(DirectCast(shape, ClipShape), indent)
    ElseIf TypeOf shape Is ShapeCollection Then
        ' Tested last so that a more specific type that happens to also be a
        ' ShapeCollection is reported under its own name.
        DumpShapeInfo(DirectCast(shape, ShapeCollection), indent)
    Else
        Console.WriteLine("{0}Shape = some other type", indent)
        otherShapes += 1
    End If
End Sub


'
' Dump information on a shape collection.
'
Friend Sub DumpShapeInfo(shape As ShapeCollection, indent As String)
    Console.WriteLine("{0}Shape = shape collection", indent)
    Console.WriteLine("{0} : N elements = {1}", indent, shape.Count)
    shapeCollections += 1
End Sub


'
' Dump information on a layer shape.
'
Friend Sub DumpShapeInfo(shape As LayerShape, indent As String)
    Console.WriteLine("{0}Shape = Layer shape", indent)
    Console.WriteLine("{0} : blendmode = {1}", indent, shape.BlendMode)
    Console.WriteLine("{0} : opacity = {1}", indent, shape.Opacity)
    Console.WriteLine("{0} : X, Y = {1} {2}", indent, shape.X, shape.Y)
    layerShapes += 1
End Sub


'
' Dump information on a text shape.
'
Friend Sub DumpShapeInfo(shape As TextShape, indent As String)
    Console.WriteLine("{0}Shape = Text shape", indent)
    Console.WriteLine("{0} : Font name = {1}", indent, shape.Font.FamilyName)
    Console.WriteLine("{0} : Font weight = {1}", indent, shape.Font.Weight)
    Console.WriteLine("{0} : Font size = {1}", indent, shape.FontSize)
    Console.WriteLine("{0} : embed mode = {1}", indent, shape.Font.EmbedMode)
    Console.WriteLine("{0} : bounding box left = {1}", indent, shape.BoundingBox.Left)
    Console.WriteLine("{0} : bounding box top = {1}", indent, shape.BoundingBox.Top)
    Console.WriteLine("{0} : bounding box Width = {1}", indent, shape.BoundingBox.Width)
    Console.WriteLine("{0} : bounding box Height = {1}", indent, shape.BoundingBox.Height)
    Console.WriteLine("{0} : blendmode = {1}", indent, shape.BlendMode)
    Console.WriteLine("{0} : opacity = {1}", indent, shape.Opacity)
    Console.WriteLine("{0} : X,Y = {1}, {2}", indent, shape.X, shape.Y)
    Console.WriteLine("{0} : Bold = {1}", indent, shape.Bold)
    Console.WriteLine("{0} : Italic = {1}", indent, shape.Italic)
    Console.WriteLine("{0} : Underline = {1}", indent, shape.Underline)
    Console.WriteLine("{0} : StrikeOut = {1}", indent, shape.StrikeOut)
    Console.WriteLine("{0} : width = {1}", indent, shape.MeasuredWidth)
    Console.WriteLine("{0} : height = {1}", indent, shape.MeasuredHeight)
    textShapes += 1
End Sub


'
' Dump information on an image shape.
'
Friend Sub DumpShapeInfo(shape As ImageShape, indent As String)
    Console.WriteLine("{0}Shape = image shape", indent)
    Console.WriteLine("{0} : blendmode = {1}", indent, shape.BlendMode)
    Console.WriteLine("{0} : mask color = {1}", indent, shape.MaskColor)
    Console.WriteLine("{0} : opacity = {1}", indent, shape.Opacity)
    Console.WriteLine("{0} : X, Y = {1} {2}", indent, shape.X, shape.Y)
    Console.WriteLine("{0} : width = {1}", indent, shape.Width)
    Console.WriteLine("{0} : height = {1}", indent, shape.Height)
    imageShapes += 1
End Sub


'
' Dump information on a freehand shape.
'
Friend Sub DumpShapeInfo(shape As FreeHandShape, indent As String)
    Console.WriteLine("{0}Shape = freehand shape", indent)
    Console.WriteLine("{0} : blendmode = {1}", indent, shape.BlendMode)
    Console.WriteLine("{0} : opacity = {1}", indent, shape.Opacity)
    Console.WriteLine("{0} : N paths = {1}", indent, shape.Paths.Count)
    Console.WriteLine("{0} : fillrule = {1}", indent, shape.FillRule)
    Console.WriteLine("{0} : Brush = {1}", indent, shape.Brush)
    Console.WriteLine("{0} : Pen = {1}", indent, shape.Pen)
    Console.WriteLine("{0} : X, Y = {1} {2}", indent, shape.X, shape.Y)
    freehandShapes += 1
End Sub


'
' Dump information on a clip shape.
'
Friend Sub DumpShapeInfo(shape As ClipShape, indent As String)
    Console.WriteLine("{0}Shape = Clip shape", indent)
    Console.WriteLine("{0} : blendmode = {1}", indent, shape.BlendMode)
    Console.WriteLine("{0} : fillrule = {1}", indent, shape.FillRule)
    Console.WriteLine("{0} : opacity = {1}", indent, shape.Opacity)
    Console.WriteLine("{0} : N paths = {1}", indent, shape.Paths.Count)
    Console.WriteLine("{0} : X, Y = {1} {2}", indent, shape.X, shape.Y)
    clipShapes += 1
End Sub