Extract graphics from PDF

Content extraction
7/25/2014

This C# code sample shows how to extract text, images and curves as shapes from a PDF document.

extract-graphics-from-pdf.PNG

Shapes

The Shape class is an abstract class with concrete specializations such as TextShape, ImageShape and LineShape. Shapes represent graphics of all types and were originally introduced to draw on a new or existing PDF page. The reverse is also possible: existing graphics on a PDF page can be extracted as shapes. The central method for this is Page.CreateShapes.

The following C# code sample enumerates all shapes on each page of a PDF document and dumps their properties to the console.

// Running totals per kind of shape. Each DumpShapeInfo overload increments
// its own counter, and Main prints all of them once the document has been
// walked.
static int shapeCollections = 0;
static int textShapes = 0;
static int imageShapes = 0;
static int freehandShapes = 0;
static int layerShapes = 0;
static int clipShapes = 0;
// Incremented when a shape matches none of the types tested by DumpShapeInfo(Shape, string).
static int otherShapes = 0;

// Entry point: opens the sample PDF read-only, walks every shape on every
// page, and finally prints how many shapes of each kind were dumped.
static void Main(string[] args)
{
    const string inputPath = @"..\..\..\inputDocuments/vectorgraphics.pdf";

    // The stream must stay open while the document is being read, so the
    // whole traversal happens inside the using block.
    using (FileStream source = new FileStream(inputPath, FileMode.Open, FileAccess.Read))
    {
        IterateShapes(new Document(source));
    }

    // Summary of the counters maintained by the DumpShapeInfo overloads.
    Console.WriteLine("Nr of shape collections = {0}", shapeCollections);
    Console.WriteLine("Nr of text shapes       = {0}", textShapes);
    Console.WriteLine("Nr of image shapes      = {0}", imageShapes);
    Console.WriteLine("Nr of freehand shapes   = {0}", freehandShapes);
    Console.WriteLine("Nr of layer shapes      = {0}", layerShapes);
    Console.WriteLine("Nr of clip shapes       = {0}", clipShapes);
    Console.WriteLine("Nr of other shapes      = {0}", otherShapes);
}


//
// iterate through all pages in a PDF document
//
// Visits the pages of the document in the order pdf.Pages yields them and
// hands each one to the per-page overload.
static void IterateShapes(Document pdf)
{
    foreach (Page currentPage in pdf.Pages)
    {
        IterateShapes(currentPage);
    }
}


//
// Get all shapes in a PDF page (this will be a shape collection)
//
// Asks the page for its shapes and starts the traversal with an empty
// indent, i.e. at the outermost nesting level.
static void IterateShapes(Page page)
{
    // Keep the explicitly typed local: it pins the call below to the
    // ShapeCollection overload.
    ShapeCollection pageShapes = page.CreateShapes();
    IterateShapes(pageShapes, string.Empty);
}


//
// iterate through each shape in a shape collection (this may recurse)
//
// Prints the header for "shapes" once, then visits every child exactly once.
//   shapes - the collection to walk
//   indent - prefix written in front of every line printed at this level;
//            children of nested collections/layers get two extra spaces
//
// A child collection is NOT passed to DumpShapeInfo here: the recursive call
// prints (and counts) its header itself. Doing both reported every nested
// collection twice.
static void IterateShapes(ShapeCollection shapes, string indent)
{
    DumpShapeInfo(shapes, indent);
    foreach (Shape shape in shapes)
    {
        // Both casts start from the Shape reference, so this compiles and
        // behaves sensibly whether or not LayerShape is itself a
        // ShapeCollection (NOTE(review): confirm against the library's
        // class hierarchy).
        LayerShape layer = shape as LayerShape;
        ShapeCollection nested = shape as ShapeCollection;

        if (layer != null)
        {
            // Report the layer's own properties through the LayerShape overload.
            DumpShapeInfo(layer, indent);
        }
        else if (nested == null)
        {
            // Plain (non-container) shape: let the dispatcher pick the overload.
            DumpShapeInfo(shape, indent);
        }

        // Descend into the container exactly once. The recursive overload is
        // preferred because it also follows collections nested further down.
        if (nested != null)
        {
            IterateShapes(nested, indent + "  ");
        }
        else if (layer != null)
        {
            // A LayerShape is also a collection
            IterateShapes(layer, indent + "  ");
        }
    }
}


//
// iterate through all shapes in a layer shape
//
// Dumps every direct child of the layer using the given indent. Children are
// only dumped; this overload does not descend into them.
static void IterateShapes(LayerShape shapes, string indent)
{
    foreach (Shape child in shapes)
    {
        DumpShapeInfo(child, indent);
    }
}


//
// Dump information on any kind of shape
//
// Dispatches to the DumpShapeInfo overload that matches the runtime type of
// the shape, so each shape is reported and counted exactly once.
//   shape  - the shape to describe
//   indent - prefix written in front of every printed line
//
// The tests form ONE else-if chain. Previously the chain was split in two,
// which made shape collections additionally fall through to "some other
// type" (inflating otherShapes) and made layer shapes print and count twice.
internal static void DumpShapeInfo(Shape shape, string indent)
{
    if (shape is TextShape)
    {
        DumpShapeInfo((TextShape)shape, indent);
    }
    else if (shape is ImageShape)
    {
        DumpShapeInfo((ImageShape)shape, indent);
    }
    else if (shape is FreeHandShape)
    {
        DumpShapeInfo((FreeHandShape)shape, indent);
    }
    else if (shape is LayerShape)
    {
        // Tested before ShapeCollection so that a layer is still reported
        // with its layer properties should LayerShape also be a
        // ShapeCollection (NOTE(review): confirm against the library).
        DumpShapeInfo((LayerShape)shape, indent);
    }
    else if (shape is ClipShape)
    {
        DumpShapeInfo((ClipShape)shape, indent);
    }
    else if (shape is ShapeCollection)
    {
        DumpShapeInfo((ShapeCollection)shape, indent);
    }
    else
    {
        // Anything not recognised above, including a null reference.
        Console.WriteLine("{0}Shape = some other type", indent);
        otherShapes++;
    }
}


//
// Dump information on a shape collection
//
// Prints a two-line description of the collection (kind and element count)
// and adds one to the shapeCollections counter.
internal static void DumpShapeInfo(ShapeCollection shape, string indent)
{
    Console.WriteLine("{0}Shape = shape collection", indent);

    var elementCount = shape.Count;
    Console.WriteLine("{0}    : N elements = {1}", indent, elementCount);

    shapeCollections += 1;
}


//
// Dump information on a layer shape
//
// Prints the layer's blend mode, opacity and position, each line prefixed
// with the indent, and adds one to the layerShapes counter.
internal static void DumpShapeInfo(LayerShape shape, string indent)
{
    // Console.WriteLine forwards to Console.Out, so writing through the
    // writer directly produces the same output.
    var stdout = Console.Out;

    stdout.WriteLine("{0}Shape = Layer shape", indent);
    stdout.WriteLine("{0}    : blendmode = {1}", indent, shape.BlendMode);
    stdout.WriteLine("{0}    : opacity = {1}", indent, shape.Opacity);
    stdout.WriteLine("{0}    : X, Y = {1} {2}", indent, shape.X, shape.Y);

    layerShapes += 1;
}


//
// Dump information on a text shape
//
// Prints the font, bounding box, blending, position, style flags and
// measured size of a text shape, then adds one to the textShapes counter.
internal static void DumpShapeInfo(TextShape shape, string indent)
{
    Console.WriteLine("{0}Shape = Text shape", indent);

    // Font details (the font object is fetched once and reused below).
    var font = shape.Font;
    Console.WriteLine("{0}    : Font name = {1}", indent, font.FamilyName);
    Console.WriteLine("{0}    : Font weight = {1}", indent, font.Weight);
    Console.WriteLine("{0}    : Font size = {1}", indent, shape.FontSize);
    Console.WriteLine("{0}    : embed mode = {1}", indent, font.EmbedMode);

    // Bounding box.
    var box = shape.BoundingBox;
    Console.WriteLine("{0}    : bounding box left = {1}", indent, box.Left);
    Console.WriteLine("{0}    : bounding box top = {1}", indent, box.Top);
    Console.WriteLine("{0}    : bounding box Width = {1}", indent, box.Width);
    Console.WriteLine("{0}    : bounding box Height = {1}", indent, box.Height);

    // Blending and position.
    Console.WriteLine("{0}    : blendmode = {1}", indent, shape.BlendMode);
    Console.WriteLine("{0}    : opacity = {1}", indent, shape.Opacity);
    Console.WriteLine("{0}    : X,Y = {1}, {2}", indent, shape.X, shape.Y);

    // Style flags.
    Console.WriteLine("{0}    : Bold = {1}", indent, shape.Bold);
    Console.WriteLine("{0}    : Italic = {1}", indent, shape.Italic);
    Console.WriteLine("{0}    : Underline = {1}", indent, shape.Underline);
    Console.WriteLine("{0}    : StrikeOut = {1}", indent, shape.StrikeOut);

    // Measured size.
    Console.WriteLine("{0}    : width = {1}", indent, shape.MeasuredWidth);
    Console.WriteLine("{0}    : height = {1}", indent, shape.MeasuredHeight);

    textShapes += 1;
}


//
// Dump information on an image shape
//
// Prints blend mode, mask color, opacity, position and size of an image
// shape, then adds one to the imageShapes counter.
internal static void DumpShapeInfo(ImageShape shape, string indent)
{
    // Console.WriteLine forwards to Console.Out, so writing through the
    // writer directly produces the same output.
    var stdout = Console.Out;

    stdout.WriteLine("{0}Shape = image shape", indent);
    stdout.WriteLine("{0}    : blendmode = {1}", indent, shape.BlendMode);
    stdout.WriteLine("{0}    : mask color = {1}", indent, shape.MaskColor);
    stdout.WriteLine("{0}    : opacity = {1}", indent, shape.Opacity);
    stdout.WriteLine("{0}    : X, Y = {1} {2}", indent, shape.X, shape.Y);
    stdout.WriteLine("{0}    : width = {1}", indent, shape.Width);
    stdout.WriteLine("{0}    : height = {1}", indent, shape.Height);

    imageShapes += 1;
}


//
// Dump information on a freehand shape
//
// Prints blend mode, opacity, path count, fill rule, brush, pen and position
// of a freehand shape, then adds one to the freehandShapes counter.
internal static void DumpShapeInfo(FreeHandShape shape, string indent)
{
    // Console.WriteLine forwards to Console.Out, so writing through the
    // writer directly produces the same output.
    var stdout = Console.Out;

    stdout.WriteLine("{0}Shape = freehand shape", indent);
    stdout.WriteLine("{0}    : blendmode = {1}", indent, shape.BlendMode);
    stdout.WriteLine("{0}    : opacity = {1}", indent, shape.Opacity);
    stdout.WriteLine("{0}    : N paths = {1}", indent, shape.Paths.Count);
    stdout.WriteLine("{0}    : fillrule = {1}", indent, shape.FillRule);
    stdout.WriteLine("{0}    : Brush = {1}", indent, shape.Brush);
    stdout.WriteLine("{0}    : Pen = {1}", indent, shape.Pen);
    stdout.WriteLine("{0}    : X, Y = {1} {2}", indent, shape.X, shape.Y);

    freehandShapes += 1;
}


//
// Dump information on a clip shape
//
// Prints blend mode, fill rule, opacity, path count and position of a clip
// shape, then adds one to the clipShapes counter.
internal static void DumpShapeInfo(ClipShape shape, string indent)
{
    // Console.WriteLine forwards to Console.Out, so writing through the
    // writer directly produces the same output.
    var stdout = Console.Out;

    stdout.WriteLine("{0}Shape = Clip shape", indent);
    stdout.WriteLine("{0}    : blendmode = {1}", indent, shape.BlendMode);
    stdout.WriteLine("{0}    : fillrule = {1}", indent, shape.FillRule);
    stdout.WriteLine("{0}    : opacity = {1}", indent, shape.Opacity);
    stdout.WriteLine("{0}    : N paths = {1}", indent, shape.Paths.Count);
    stdout.WriteLine("{0}    : X, Y = {1} {2}", indent, shape.X, shape.Y);

    clipShapes += 1;
}
    ' Running totals per kind of shape. Each DumpShapeInfo overload increments
    ' its own counter, and Main prints all of them once the document has been
    ' walked.
    Dim shapeCollections As Integer = 0
    Dim textShapes As Integer = 0
    Dim imageShapes As Integer = 0
    Dim freehandShapes As Integer = 0
    Dim layerShapes As Integer = 0
    Dim clipShapes As Integer = 0
    ' Incremented when a shape matches none of the types tested by DumpShapeInfo(Shape, String).
    Dim otherShapes As Integer = 0

    ' Entry point: opens the sample PDF read-only, walks every shape on every
    ' page, and finally prints how many shapes of each kind were dumped.
    Private Sub Main(args As String())
        Const inputPath As String = "..\..\..\inputDocuments/vectorgraphics.pdf"

        ' The stream must stay open while the document is being read, so the
        ' whole traversal happens inside the Using block.
        Using source As New FileStream(inputPath, FileMode.Open, FileAccess.Read)
            IterateShapes(New Document(source))
        End Using

        ' Summary of the counters maintained by the DumpShapeInfo overloads.
        Console.WriteLine("Nr of shape collections = {0}", shapeCollections)
        Console.WriteLine("Nr of text shapes       = {0}", textShapes)
        Console.WriteLine("Nr of image shapes      = {0}", imageShapes)
        Console.WriteLine("Nr of freehand shapes   = {0}", freehandShapes)
        Console.WriteLine("Nr of layer shapes      = {0}", layerShapes)
        Console.WriteLine("Nr of clip shapes       = {0}", clipShapes)
        Console.WriteLine("Nr of other shapes      = {0}", otherShapes)
    End Sub


    '
    ' iterate through all pages in a PDF document
    '
    ' Visits the pages of the document in the order pdf.Pages yields them and
    ' hands each one to the per-page overload.
    Private Sub IterateShapes(pdf As Document)
        For Each currentPage As Page In pdf.Pages
            IterateShapes(currentPage)
        Next currentPage
    End Sub


    '
    ' Get all shapes in a PDF page (this will be a shape collection)
    '
    ' Asks the page for its shapes and starts the traversal with an empty
    ' indent, i.e. at the outermost nesting level.
    Private Sub IterateShapes(page As Page)
        ' Keep the explicitly typed local: it pins the call below to the
        ' ShapeCollection overload.
        Dim pageShapes As ShapeCollection = page.CreateShapes()
        IterateShapes(pageShapes, String.Empty)
    End Sub


    '
    ' iterate through each shape in a shape collection (this may recurse)
    '
    ' Prints the header for "shapes" once, then visits every child exactly once.
    '   shapes - the collection to walk
    '   indent - prefix written in front of every line printed at this level;
    '            children of nested collections/layers get two extra spaces
    '
    ' A child collection is NOT passed to DumpShapeInfo here: the recursive
    ' call prints (and counts) its header itself. Doing both reported every
    ' nested collection twice.
    Private Sub IterateShapes(shapes As ShapeCollection, indent As String)
        DumpShapeInfo(shapes, indent)
        For Each shape As Shape In shapes
            ' Both casts start from the Shape reference, so this compiles and
            ' behaves sensibly whether or not LayerShape is itself a
            ' ShapeCollection (NOTE(review): confirm against the library's
            ' class hierarchy).
            Dim layer As LayerShape = TryCast(shape, LayerShape)
            Dim nested As ShapeCollection = TryCast(shape, ShapeCollection)

            If layer IsNot Nothing Then
                ' Report the layer's own properties through the LayerShape overload.
                DumpShapeInfo(layer, indent)
            ElseIf nested Is Nothing Then
                ' Plain (non-container) shape: let the dispatcher pick the overload.
                DumpShapeInfo(shape, indent)
            End If

            ' Descend into the container exactly once. The recursive overload
            ' is preferred because it also follows collections nested further
            ' down.
            If nested IsNot Nothing Then
                IterateShapes(nested, indent & "  ")
            ElseIf layer IsNot Nothing Then
                ' A LayerShape is also a collection
                IterateShapes(layer, indent & "  ")
            End If
        Next
    End Sub


    '
    ' iterate through all shapes in a layer shape
    '
    ' Dumps every direct child of the layer using the given indent. Children
    ' are only dumped; this overload does not descend into them.
    Private Sub IterateShapes(shapes As LayerShape, indent As String)
        For Each child As Shape In shapes
            DumpShapeInfo(child, indent)
        Next child
    End Sub


    '
    ' Dump information on any kind of shape
    '
    ' Dispatches to the DumpShapeInfo overload that matches the runtime type
    ' of the shape, so each shape is reported and counted exactly once.
    '   shape  - the shape to describe
    '   indent - prefix written in front of every printed line
    '
    ' The tests form ONE If/ElseIf chain. Previously the chain was split in
    ' two, which made shape collections additionally fall through to "some
    ' other type" (inflating otherShapes) and made layer shapes print and
    ' count twice.
    Friend Sub DumpShapeInfo(shape As Shape, indent As String)
        If TypeOf shape Is TextShape Then
            DumpShapeInfo(DirectCast(shape, TextShape), indent)
        ElseIf TypeOf shape Is ImageShape Then
            DumpShapeInfo(DirectCast(shape, ImageShape), indent)
        ElseIf TypeOf shape Is FreeHandShape Then
            DumpShapeInfo(DirectCast(shape, FreeHandShape), indent)
        ElseIf TypeOf shape Is LayerShape Then
            ' Tested before ShapeCollection so that a layer is still reported
            ' with its layer properties should LayerShape also be a
            ' ShapeCollection (NOTE(review): confirm against the library).
            DumpShapeInfo(DirectCast(shape, LayerShape), indent)
        ElseIf TypeOf shape Is ClipShape Then
            DumpShapeInfo(DirectCast(shape, ClipShape), indent)
        ElseIf TypeOf shape Is ShapeCollection Then
            DumpShapeInfo(DirectCast(shape, ShapeCollection), indent)
        Else
            ' Anything not recognised above, including Nothing.
            Console.WriteLine("{0}Shape = some other type", indent)
            otherShapes += 1
        End If
    End Sub


    '
    ' Dump information on a shape collection
    '
    ' Prints a two-line description of the collection (kind and element
    ' count) and adds one to the shapeCollections counter.
    Friend Sub DumpShapeInfo(shape As ShapeCollection, indent As String)
        Console.WriteLine("{0}Shape = shape collection", indent)
        With shape
            Console.WriteLine("{0}    : N elements = {1}", indent, .Count)
        End With
        shapeCollections = shapeCollections + 1
    End Sub



    '
    ' Dump information on a layer shape
    '
    ' Prints the layer's blend mode, opacity and position, each line prefixed
    ' with the indent, and adds one to the layerShapes counter.
    Friend Sub DumpShapeInfo(shape As LayerShape, indent As String)
        Console.WriteLine("{0}Shape = Layer shape", indent)
        With shape
            Console.WriteLine("{0}    : blendmode = {1}", indent, .BlendMode)
            Console.WriteLine("{0}    : opacity = {1}", indent, .Opacity)
            Console.WriteLine("{0}    : X, Y = {1} {2}", indent, .X, .Y)
        End With
        layerShapes = layerShapes + 1
    End Sub




    '
    ' Dump information on a text shape
    '
    ' Prints the font, bounding box, blending, position, style flags and
    ' measured size of a text shape, then adds one to the textShapes counter.
    Friend Sub DumpShapeInfo(shape As TextShape, indent As String)
        Console.WriteLine("{0}Shape = Text shape", indent)
        With shape
            ' Font details.
            Console.WriteLine("{0}    : Font name = {1}", indent, .Font.FamilyName)
            Console.WriteLine("{0}    : Font weight = {1}", indent, .Font.Weight)
            Console.WriteLine("{0}    : Font size = {1}", indent, .FontSize)
            Console.WriteLine("{0}    : embed mode = {1}", indent, .Font.EmbedMode)

            ' Bounding box.
            Console.WriteLine("{0}    : bounding box left = {1}", indent, .BoundingBox.Left)
            Console.WriteLine("{0}    : bounding box top = {1}", indent, .BoundingBox.Top)
            Console.WriteLine("{0}    : bounding box Width = {1}", indent, .BoundingBox.Width)
            Console.WriteLine("{0}    : bounding box Height = {1}", indent, .BoundingBox.Height)

            ' Blending and position.
            Console.WriteLine("{0}    : blendmode = {1}", indent, .BlendMode)
            Console.WriteLine("{0}    : opacity = {1}", indent, .Opacity)
            Console.WriteLine("{0}    : X,Y = {1}, {2}", indent, .X, .Y)

            ' Style flags.
            Console.WriteLine("{0}    : Bold = {1}", indent, .Bold)
            Console.WriteLine("{0}    : Italic = {1}", indent, .Italic)
            Console.WriteLine("{0}    : Underline = {1}", indent, .Underline)
            Console.WriteLine("{0}    : StrikeOut = {1}", indent, .StrikeOut)

            ' Measured size.
            Console.WriteLine("{0}    : width = {1}", indent, .MeasuredWidth)
            Console.WriteLine("{0}    : height = {1}", indent, .MeasuredHeight)
        End With
        textShapes = textShapes + 1
    End Sub


    '
    ' Dump information on an image shape
    '
    ' Prints blend mode, mask color, opacity, position and size of an image
    ' shape, then adds one to the imageShapes counter.
    Friend Sub DumpShapeInfo(shape As ImageShape, indent As String)
        Console.WriteLine("{0}Shape = image shape", indent)
        With shape
            Console.WriteLine("{0}    : blendmode = {1}", indent, .BlendMode)
            Console.WriteLine("{0}    : mask color = {1}", indent, .MaskColor)
            Console.WriteLine("{0}    : opacity = {1}", indent, .Opacity)
            Console.WriteLine("{0}    : X, Y = {1} {2}", indent, .X, .Y)
            Console.WriteLine("{0}    : width = {1}", indent, .Width)
            Console.WriteLine("{0}    : height = {1}", indent, .Height)
        End With
        imageShapes = imageShapes + 1
    End Sub


    '
    ' Dump information on a freehand shape
    '
    ' Prints blend mode, opacity, path count, fill rule, brush, pen and
    ' position of a freehand shape, then adds one to the freehandShapes
    ' counter.
    Friend Sub DumpShapeInfo(shape As FreeHandShape, indent As String)
        Console.WriteLine("{0}Shape = freehand shape", indent)
        With shape
            Console.WriteLine("{0}    : blendmode = {1}", indent, .BlendMode)
            Console.WriteLine("{0}    : opacity = {1}", indent, .Opacity)
            Console.WriteLine("{0}    : N paths = {1}", indent, .Paths.Count)
            Console.WriteLine("{0}    : fillrule = {1}", indent, .FillRule)
            Console.WriteLine("{0}    : Brush = {1}", indent, .Brush)
            Console.WriteLine("{0}    : Pen = {1}", indent, .Pen)
            Console.WriteLine("{0}    : X, Y = {1} {2}", indent, .X, .Y)
        End With
        freehandShapes = freehandShapes + 1
    End Sub

    '
    ' Dump information on a clip shape
    '
    ' Prints blend mode, fill rule, opacity, path count and position of a
    ' clip shape, then adds one to the clipShapes counter.
    Friend Sub DumpShapeInfo(shape As ClipShape, indent As String)
        Console.WriteLine("{0}Shape = Clip shape", indent)
        With shape
            Console.WriteLine("{0}    : blendmode = {1}", indent, .BlendMode)
            Console.WriteLine("{0}    : fillrule = {1}", indent, .FillRule)
            Console.WriteLine("{0}    : opacity = {1}", indent, .Opacity)
            Console.WriteLine("{0}    : N paths = {1}", indent, .Paths.Count)
            Console.WriteLine("{0}    : X, Y = {1} {2}", indent, .X, .Y)
        End With
        clipShapes = clipShapes + 1
    End Sub