Extract glyph boxes from PDF

Content extraction
3/21/2014

Downloads

This sample demonstrates how to extract glyph boxes.

This sample creates a bitmap for each page and draws boxes for each glyph. It takes into account the orientation of the page, as well as its cropbox and mediabox so that the bitmap resembles the page as shown by a PDF viewer. The main routine here is CreateBoxesBitmap. It takes a page as an argument and returns a Bitmap with drawn boxes.

For the following PDF page:

page.png

We get the following result:

result.png

C# code sample

/// <summary>
/// Opens the sample PDF, renders the glyph boxes of every page into a bitmap
/// and saves each bitmap as its own PNG file (out1.png, out2.png, ...).
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    using (FileStream fileIn = new FileStream(@"..\..\..\inputdocuments\R0.pdf", FileMode.Open, FileAccess.Read))
    {
        // create document
        Document document = new Document(fileIn);

        // Number the output files: saving every page to the same path would
        // leave only the last page on disk.
        int pageNumber = 0;
        foreach (Page page in document.Pages)
        {
            pageNumber++;
            string fileName = @"..\..\out"
                + pageNumber.ToString(System.Globalization.CultureInfo.InvariantCulture)
                + ".png";

            // The bitmap wraps a GDI+ resource, so release it as soon as it is saved.
            using (System.Drawing.Bitmap bitmap = CreateBoxesBitmap(page))
            {
                bitmap.Save(fileName, System.Drawing.Imaging.ImageFormat.Png);
            }
        }
    }
}

/// <summary>
/// Creates a bitmap the size of the visible part of the page and draws a red
/// outline around every glyph on the page.
/// </summary>
/// <param name="page">The page whose glyph boxes are drawn.</param>
/// <returns>
/// A new bitmap with a white background and one polygon per glyph.
/// The caller owns the bitmap and is responsible for disposing it.
/// </returns>
private static System.Drawing.Bitmap CreateBoxesBitmap(Page page)
{
    // Compute the part of the page that is visible in a viewer.
    Rectangle visibleRectangle = GetVisibleRectangle(page);

    // determine the size taking the orientation into account
    int width = (int)Math.Round(visibleRectangle.Width);
    int height = (int)Math.Round(visibleRectangle.Height);

    Orientation orientation = page.Orientation;
    if (orientation == Orientation.Rotate90 || orientation == Orientation.Rotate270)
    {
        // A quarter-turn rotation exchanges the width and height of the bitmap.
        int temp = width;
        width = height;
        height = temp;
    }

    // create the resulting bitmap
    var bitmap = new System.Drawing.Bitmap(width, height);
    try
    {
        using (System.Drawing.Graphics graphics = System.Drawing.Graphics.FromImage(bitmap))
        using (System.Drawing.Pen pen = new System.Drawing.Pen(System.Drawing.Color.Red))
        {
            graphics.Clear(System.Drawing.Color.White);

            // retrieve all glyphs on the current page and draw a rectangle for each.
            foreach (Glyph glyph in page.Glyphs)
            {
                // Convert each corner to a GDI coordinate. The four corners are
                // converted separately and drawn as a polygon, so the outline
                // follows the corners even when they do not form an axis-aligned box.
                System.Drawing.PointF bottomLeft = PDFPointToGDI(glyph.BottomLeft, visibleRectangle, orientation);
                System.Drawing.PointF bottomRight = PDFPointToGDI(glyph.BottomRight, visibleRectangle, orientation);
                System.Drawing.PointF topRight = PDFPointToGDI(glyph.TopRight, visibleRectangle, orientation);
                System.Drawing.PointF topLeft = PDFPointToGDI(glyph.TopLeft, visibleRectangle, orientation);

                System.Drawing.PointF[] points = new[] { bottomLeft, bottomRight, topRight, topLeft };

                // draw glyph box
                graphics.DrawPolygon(pen, points);
            }
        }
    }
    catch
    {
        // Do not leak the bitmap when drawing fails; the caller never receives it.
        bitmap.Dispose();
        throw;
    }

    return bitmap;
}

VB.NET code sample

''' <summary>
''' Opens the sample PDF, renders the glyph boxes of every page into a bitmap
''' and saves each bitmap as its own PNG file (out1.png, out2.png, ...).
''' </summary>
''' <param name="args">Command-line arguments (not used).</param>
Private Sub Main(args As String())
    Using fileIn As New FileStream("..\..\..\inputdocuments\R0.pdf", FileMode.Open, FileAccess.Read)
        'create document
        Dim document As New Document(fileIn)

        ' Number the output files: saving every page to the same path would
        ' leave only the last page on disk.
        Dim pageNumber As Integer = 0
        For Each page As Page In document.Pages
            pageNumber += 1
            Dim fileName As String = "..\..\out" &
                pageNumber.ToString(System.Globalization.CultureInfo.InvariantCulture) &
                ".png"

            ' The bitmap wraps a GDI+ resource, so release it as soon as it is saved.
            Using bitmap As System.Drawing.Bitmap = CreateBoxesBitmap(page)
                bitmap.Save(fileName, System.Drawing.Imaging.ImageFormat.Png)
            End Using
        Next
    End Using
End Sub

''' <summary>
''' Creates a bitmap the size of the visible part of the page and draws a red
''' outline around every glyph on the page.
''' </summary>
''' <param name="page">The page whose glyph boxes are drawn.</param>
''' <returns>
''' A new bitmap with a white background and one polygon per glyph.
''' The caller owns the bitmap and is responsible for disposing it.
''' </returns>
Private Function CreateBoxesBitmap(page As Page) As System.Drawing.Bitmap
    ' Compute the part of the page that is visible in a viewer.
    Dim visibleRectangle As Rectangle = GetVisibleRectangle(page)

    ' determine the size taking the orientation into account
    Dim width As Integer = CInt(Math.Round(visibleRectangle.Width))
    Dim height As Integer = CInt(Math.Round(visibleRectangle.Height))

    Dim pageOrientation As Orientation = page.Orientation
    If pageOrientation = Orientation.Rotate90 OrElse pageOrientation = Orientation.Rotate270 Then
        ' A quarter-turn rotation exchanges the width and height of the bitmap.
        Dim temp As Integer = width
        width = height
        height = temp
    End If

    ' create the resulting bitmap
    Dim bitmap = New System.Drawing.Bitmap(width, height)
    Try
        Using graphics As System.Drawing.Graphics = System.Drawing.Graphics.FromImage(bitmap)
            Using pen As New System.Drawing.Pen(System.Drawing.Color.Red)
                graphics.Clear(System.Drawing.Color.White)

                ' retrieve all glyphs on the current page and draw a rectangle for each.
                For Each glyph As Glyph In page.Glyphs
                    ' Convert each corner to a GDI coordinate. The four corners are
                    ' converted separately and drawn as a polygon, so the outline
                    ' follows the corners even when they do not form an axis-aligned box.
                    Dim bottomLeft As System.Drawing.PointF = PDFPointToGDI(glyph.BottomLeft, visibleRectangle, pageOrientation)
                    Dim bottomRight As System.Drawing.PointF = PDFPointToGDI(glyph.BottomRight, visibleRectangle, pageOrientation)
                    Dim topRight As System.Drawing.PointF = PDFPointToGDI(glyph.TopRight, visibleRectangle, pageOrientation)
                    Dim topLeft As System.Drawing.PointF = PDFPointToGDI(glyph.TopLeft, visibleRectangle, pageOrientation)

                    Dim points As System.Drawing.PointF() = {bottomLeft, bottomRight, topRight, topLeft}

                    ' draw glyph box
                    graphics.DrawPolygon(pen, points)
                Next
            End Using
        End Using
    Catch
        ' Do not leak the bitmap when drawing fails; the caller never receives it.
        bitmap.Dispose()
        Throw
    End Try

    Return bitmap
End Function

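Note that we need to convert each coordinate into a GDI coordinate, because PDF has its origin at the bottom left of the page (GDI has it at the top left), and the page may be rotated as well. Below is the code of the PDFPointToGDI routine, together with the GetVisibleRectangle helper used above and the Intersection routine it calls.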

C# code sample

/// <summary>
/// Computes the visible part of a page: the area (0, 0, Width, Height)
/// clipped to the media box and then to the crop box, when each is present.
/// </summary>
/// <param name="page">The page to inspect.</param>
/// <returns>The rectangle that remains after clipping.</returns>
static Rectangle GetVisibleRectangle(Page page)
{
    Rectangle visible = new Rectangle(0, 0, page.Width, page.Height);

    // Clip to the media box first ...
    Rectangle media = page.MediaBox;
    if (media != null)
    {
        visible = Intersection(visible, media);
    }

    // ... and then to the crop box.
    Rectangle crop = page.CropBox;
    if (crop != null)
    {
        visible = Intersection(visible, crop);
    }

    return visible;
}

/// <summary>
/// Converts a point in PDF page coordinates into a coordinate on the bitmap
/// that represents the visible rectangle, for the given page orientation.
/// </summary>
/// <param name="point">The point in PDF coordinates.</param>
/// <param name="rectangle">The visible rectangle of the page.</param>
/// <param name="orientation">The orientation of the page.</param>
/// <returns>
/// The converted point; the input point is returned unchanged for an
/// orientation that is not handled.
/// </returns>
static System.Drawing.PointF PDFPointToGDI(System.Drawing.PointF point, Rectangle rectangle, Orientation orientation)
{
    // Offsets relative to the left and bottom edges of the visible rectangle,
    // which need not start at (0,0).
    double offsetX = point.X - rectangle.Left;
    double offsetY = point.Y - rectangle.Bottom;

    if (orientation == Orientation.Rotate0)
    {
        // Only the vertical direction is flipped.
        return new System.Drawing.PointF((float)offsetX, (float)(rectangle.Height - offsetY));
    }

    if (orientation == Orientation.Rotate90)
    {
        // Exchange x and y and flip both against the rectangle's extent.
        return new System.Drawing.PointF((float)(rectangle.Height - offsetY), (float)(rectangle.Width - offsetX));
    }

    if (orientation == Orientation.Rotate180)
    {
        // Point mirror of the Rotate0 case.
        return new System.Drawing.PointF((float)(rectangle.Width - offsetX), (float)offsetY);
    }

    if (orientation == Orientation.Rotate270)
    {
        // Point mirror of the Rotate90 case.
        return new System.Drawing.PointF((float)offsetY, (float)offsetX);
    }

    return point;
}

/// <summary>
/// Computes the overlap of two rectangles.
/// </summary>
/// <param name="rect1">The first rectangle.</param>
/// <param name="rect2">The second rectangle.</param>
/// <returns>
/// A rectangle spanning the overlap. No overlap check is performed: when the
/// inputs do not overlap, the result has a negative width or height.
/// </returns>
static Rectangle Intersection(Rectangle rect1, Rectangle rect2)
{
    // The overlap starts at the larger of the left / bottom edges ...
    double left = Math.Max(rect1.Left, rect2.Left);
    double bottom = Math.Max(rect1.Bottom, rect2.Bottom);

    // ... and ends at the smaller of the right / top edges.
    double right = Math.Min(rect1.Left + rect1.Width, rect2.Left + rect2.Width);
    double top = Math.Min(rect1.Bottom + rect1.Height, rect2.Bottom + rect2.Height);

    return new Rectangle(left, bottom, right - left, top - bottom);
}

VB.NET code sample

''' <summary>
''' Computes the visible part of a page: the area (0, 0, Width, Height)
''' clipped to the media box and then to the crop box, when each is present.
''' </summary>
''' <param name="page">The page to inspect.</param>
''' <returns>The rectangle that remains after clipping.</returns>
Private Function GetVisibleRectangle(page As Page) As Rectangle
    Dim visible As New Rectangle(0, 0, page.Width, page.Height)

    ' Clip to the media box first ...
    Dim media As Rectangle = page.MediaBox
    If media IsNot Nothing Then
        visible = Intersection(visible, media)
    End If

    ' ... and then to the crop box.
    Dim crop As Rectangle = page.CropBox
    If crop IsNot Nothing Then
        visible = Intersection(visible, crop)
    End If

    Return visible
End Function

''' <summary>
''' Converts a point in PDF page coordinates into a coordinate on the bitmap
''' that represents the visible rectangle, for the given page orientation.
''' </summary>
''' <param name="point">The point in PDF coordinates.</param>
''' <param name="rectangle">The visible rectangle of the page.</param>
''' <param name="orientation__1">The orientation of the page.</param>
''' <returns>
''' The converted point; the input point is returned unchanged for an
''' orientation that is not handled.
''' </returns>
Private Function PDFPointToGDI(point As System.Drawing.PointF, rectangle As Rectangle, orientation__1 As Orientation) As System.Drawing.PointF
    ' Offsets relative to the left and bottom edges of the visible rectangle,
    ' which need not start at (0,0).
    Dim offsetX As Double = point.X - rectangle.Left
    Dim offsetY As Double = point.Y - rectangle.Bottom

    If orientation__1 = Orientation.Rotate0 Then
        ' Only the vertical direction is flipped.
        Return New System.Drawing.PointF(CSng(offsetX), CSng(rectangle.Height - offsetY))
    ElseIf orientation__1 = Orientation.Rotate90 Then
        ' Exchange x and y and flip both against the rectangle's extent.
        Return New System.Drawing.PointF(CSng(rectangle.Height - offsetY), CSng(rectangle.Width - offsetX))
    ElseIf orientation__1 = Orientation.Rotate180 Then
        ' Point mirror of the Rotate0 case.
        Return New System.Drawing.PointF(CSng(rectangle.Width - offsetX), CSng(offsetY))
    ElseIf orientation__1 = Orientation.Rotate270 Then
        ' Point mirror of the Rotate90 case.
        Return New System.Drawing.PointF(CSng(offsetY), CSng(offsetX))
    End If

    Return point
End Function

''' <summary>
''' Computes the overlap of two rectangles.
''' </summary>
''' <param name="rect1">The first rectangle.</param>
''' <param name="rect2">The second rectangle.</param>
''' <returns>
''' A rectangle spanning the overlap. No overlap check is performed: when the
''' inputs do not overlap, the result has a negative width or height.
''' </returns>
Private Function Intersection(rect1 As Rectangle, rect2 As Rectangle) As Rectangle
    ' The overlap starts at the larger of the left / bottom edges ...
    Dim left As Double = Math.Max(rect1.Left, rect2.Left)
    Dim bottom As Double = Math.Max(rect1.Bottom, rect2.Bottom)

    ' ... and ends at the smaller of the right / top edges.
    Dim right As Double = Math.Min(rect1.Left + rect1.Width, rect2.Left + rect2.Width)
    Dim top As Double = Math.Min(rect1.Bottom + rect1.Height, rect2.Bottom + rect2.Height)

    Return New Rectangle(left, bottom, right - left, top - bottom)
End Function