You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2010/03/09 20:18:34 UTC

svn commit: r921062 - in /pdfbox/trunk/src/main/java/org/apache/pdfbox: pdmodel/documentinterchange/logicalstructure/ pdmodel/documentinterchange/markedcontent/ util/ util/operator/

Author: lehmi
Date: Tue Mar  9 19:18:33 2010
New Revision: 921062

URL: http://svn.apache.org/viewvc?rev=921062&view=rev
Log:
PDFBOX-7: added support for the extraction of information of tagged pdfs. Patch by Johannes Koch (johannes dot koch at fit dot fraunhofer dot de)

Added:
    pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/PDMarkedContentReference.java
    pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/PDObjectReference.java
    pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/PDStructureNode.java
    pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/Revisions.java
    pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/markedcontent/
    pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/markedcontent/PDMarkedContent.java
    pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/markedcontent/package.html
    pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java
    pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/BeginMarkedContentSequence.java
    pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/BeginMarkedContentSequenceWithProperties.java
    pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/EndMarkedContentSequence.java
Modified:
    pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/PDStructureElement.java
    pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/PDStructureTreeRoot.java
    pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/Invoke.java

Added: pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/PDMarkedContentReference.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/PDMarkedContentReference.java?rev=921062&view=auto
==============================================================================
--- pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/PDMarkedContentReference.java (added)
+++ pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/PDMarkedContentReference.java Tue Mar  9 19:18:33 2010
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure;
+
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.common.COSObjectable;
+
+/**
+ * A marked-content reference.
+ * 
+ * @author Koch
+ * @version $Revision: $
+ */
+public class PDMarkedContentReference implements COSObjectable
+{
+
+    public static final String TYPE = "MCR";
+
+    private COSDictionary dictionary;
+
+    /**
+     * Returns the underlying dictionary.
+     *
+     * @return the dictionary this marked-content reference wraps
+     */
+    protected COSDictionary getCOSDictionary()
+    {
+        return this.dictionary;
+    }
+
+    /**
+     * Default constructor
+     */
+    public PDMarkedContentReference()
+    {
+        this.dictionary = new COSDictionary();
+        this.dictionary.setName(COSName.TYPE, TYPE);
+    }
+
+    /**
+     * Constructor for an existing marked content reference.
+     * 
+     * @param dictionary the existing dictionary of the marked content
+     *  reference
+     */
+    public PDMarkedContentReference(COSDictionary dictionary)
+    {
+        this.dictionary = dictionary;
+    }
+
+    /**
+     * {@inheritDoc}
+     */
+    public COSBase getCOSObject()
+    {
+        return this.dictionary;
+    }
+
+    /**
+     * Gets the page (Pg).
+     * 
+     * @return the page, or <code>null</code> if the Pg entry is missing or
+     *  is not a dictionary
+     */
+    public PDPage getPage()
+    {
+        // check the type instead of casting blindly, a malformed Pg entry
+        // must not end in a ClassCastException
+        COSBase pg = this.getCOSDictionary().getDictionaryObject("Pg");
+        if (pg instanceof COSDictionary)
+        {
+            return new PDPage((COSDictionary) pg);
+        }
+        return null;
+    }
+
+    /**
+     * Sets the page.
+     * 
+     * @param page the page
+     */
+    public void setPage(PDPage page)
+    {
+        this.getCOSDictionary().setItem("Pg", page);
+    }
+
+    /**
+     * Gets the marked content identifier.
+     * 
+     * @return the marked content identifier
+     */
+    public int getMCID()
+    {
+        return this.getCOSDictionary().getInt("MCID");
+    }
+
+    /**
+     * Sets the marked content identifier.
+     * 
+     * @param mcid the marked content identifier
+     */
+    public void setMCID(int mcid)
+    {
+        this.getCOSDictionary().setInt("MCID", mcid);
+    }
+
+
+    /**
+     * {@inheritDoc}
+     */
+    @Override
+    public String toString()
+    {
+        // "mcid=" followed by the marked content identifier
+        final int mcid = this.getMCID();
+        return "mcid=" + mcid;
+    }
+
+}

Added: pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/PDObjectReference.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/PDObjectReference.java?rev=921062&view=auto
==============================================================================
--- pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/PDObjectReference.java (added)
+++ pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/PDObjectReference.java Tue Mar  9 19:18:33 2010
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure;
+
+import java.io.IOException;
+
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.pdmodel.common.COSObjectable;
+import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObject;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
+
+/**
+ * An object reference.
+ * 
+ * @author Koch
+ * @version $Revision: $
+ */
+public class PDObjectReference implements COSObjectable
+{
+
+    public static final String TYPE = "OBJR";
+
+    private COSDictionary dictionary;
+
+    /**
+     * Returns the underlying dictionary.
+     *
+     * @return the dictionary this object reference wraps
+     */
+    protected COSDictionary getCOSDictionary()
+    {
+        return this.dictionary;
+    }
+
+    /**
+     * Default Constructor.
+     *
+     */
+    public PDObjectReference()
+    {
+        this.dictionary = new COSDictionary();
+        this.dictionary.setName(COSName.TYPE, TYPE);
+    }
+
+    /**
+     * Constructor for an existing object reference.
+     *
+     * @param dictionary The existing dictionary.
+     */
+    public PDObjectReference(COSDictionary dictionary)
+    {
+        this.dictionary = dictionary;
+    }
+
+    /**
+     * {@inheritDoc}
+     */
+    public COSBase getCOSObject()
+    {
+        return this.dictionary;
+    }
+
+    /**
+     * Gets a higher-level object for the referenced object (Obj).
+     * The entry is first interpreted as a {@link PDAnnotation}; only if that
+     * attempt fails with an {@link IOException} it is interpreted as a
+     * {@link PDXObject}.
+     * 
+     * @return the result of the first attempt that does not fail, or
+     *  <code>null</code> if both attempts fail
+     */
+    public COSObjectable getReferencedObject()
+    {
+        COSBase target = this.getCOSDictionary().getDictionaryObject("Obj");
+        try
+        {
+            return PDAnnotation.createAnnotation(target);
+        }
+        catch (IOException noAnnotation)
+        {
+            // No Annotation, try an XObject next
+        }
+        try
+        {
+            return PDXObject.createXObject(target);
+        }
+        catch (IOException noXObject)
+        {
+            // No XObject
+            // TODO what else can be the target of the object reference?
+        }
+        return null;
+    }
+
+    /**
+     * Sets the referenced annotation.
+     * 
+     * @param annotation the referenced annotation
+     */
+    public void setReferencedObject(PDAnnotation annotation)
+    {
+        this.getCOSDictionary().setItem("Obj", annotation);
+    }
+
+    /**
+     * Sets the referenced XObject.
+     * 
+     * @param xobject the referenced XObject
+     */
+    public void setReferencedObject(PDXObject xobject)
+    {
+        this.getCOSDictionary().setItem("Obj", xobject);
+    }
+
+}

Modified: pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/PDStructureElement.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/PDStructureElement.java?rev=921062&r1=921061&r2=921062&view=diff
==============================================================================
--- pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/PDStructureElement.java (original)
+++ pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/PDStructureElement.java Tue Mar  9 19:18:33 2010
@@ -16,10 +16,17 @@
  */
 package org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure;
 
+import java.util.Iterator;
+import java.util.Map;
+
+import org.apache.pdfbox.cos.COSArray;
 import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSInteger;
 import org.apache.pdfbox.cos.COSName;
-import org.apache.pdfbox.pdmodel.common.COSObjectable;
+import org.apache.pdfbox.cos.COSObject;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
 
 /**
  * A structure element.
@@ -27,18 +34,22 @@ import org.apache.pdfbox.pdmodel.common.
  * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
  * @version $Revision: 1.3 $
  */
-public class PDStructureElement implements COSObjectable
+public class PDStructureElement extends PDStructureNode
 {
-    private COSDictionary dictionary;
+    public static final String TYPE = "StructElem";
+
 
     /**
-     * Default Constructor.
+     * Constructor with required values.
      *
+     * @param structureType the structure type
+     * @param parent the parent structure node
      */
-    public PDStructureElement()
+    public PDStructureElement(String structureType, PDStructureNode parent)
     {
-        dictionary = new COSDictionary();
-        dictionary.setName( COSName.TYPE, "StructElem" );
+        super(TYPE);
+        this.setStructureType(structureType);
+        this.setParent(parent);
     }
 
     /**
@@ -48,26 +59,500 @@ public class PDStructureElement implemen
      */
     public PDStructureElement( COSDictionary dic )
     {
-        dictionary = dic;
+        super(dic);
     }
 
+
     /**
-     * Convert this standard java object to a COS object.
-     *
-     * @return The cos object that matches this Java object.
+     * Returns the structure type (S).
+     * 
+     * @return the structure type
      */
-    public COSBase getCOSObject()
+    public String getStructureType()
     {
-        return dictionary;
+        return this.getCOSDictionary().getNameAsString("S");
     }
 
     /**
-     * Get the low level dictionary that this object wraps.
-     *
-     * @return The cos dictionary that matches this Java object.
+     * Sets the structure type (S).
+     * 
+     * @param structureType the structure type
+     */
+    public void setStructureType(String structureType)
+    {
+        this.getCOSDictionary().setName("S", structureType);
+    }
+
+    /**
+     * Returns the parent in the structure hierarchy (P).
+     * 
+     * @return the parent in the structure hierarchy, or <code>null</code> if
+     *  the P entry is missing or is not a dictionary
+     */
+    public PDStructureNode getParent()
+    {
+        // a malformed P entry yields null rather than a ClassCastException
+        COSBase p = this.getCOSDictionary().getDictionaryObject(COSName.P);
+        if (p instanceof COSDictionary)
+        {
+            return PDStructureNode.create((COSDictionary) p);
+        }
+        return null;
+    }
+
+    /**
+     * Sets the parent in the structure hierarchy (P).
+     * 
+     * @param structureNode the parent in the structure hierarchy
+     */
+    public void setParent(PDStructureNode structureNode)
+    {
+        this.getCOSDictionary().setItem(COSName.P, structureNode);
+    }
+
+    /**
+     * Returns the element identifier (ID).
+     * 
+     * @return the element identifier
+     */
+    public String getElementIdentifier()
+    {
+        return this.getCOSDictionary().getString("ID");
+    }
+
+    /**
+     * Sets the element identifier (ID).
+     * 
+     * @param id the element identifier
+     */
+    public void setElementIdentifier(String id)
+    {
+        this.getCOSDictionary().setString("ID", id);
+    }
+
+    /**
+     * Returns the page on which some or all of the content items designated by
+     *  the K entry shall be rendered (Pg).
+     * 
+     * @return the page on which some or all of the content items designated by
+     *  the K entry shall be rendered, or <code>null</code> if the Pg entry is
+     *  missing or is not a dictionary
+     */
+    public PDPage getPage()
+    {
+        // a malformed Pg entry yields null rather than a ClassCastException
+        COSBase pageDic = this.getCOSDictionary().getDictionaryObject("Pg");
+        if (pageDic instanceof COSDictionary)
+        {
+            return new PDPage((COSDictionary) pageDic);
+        }
+        return null;
+    }
+
+    /**
+     * Sets the page on which some or all of the content items designated by
+     *  the K entry shall be rendered (Pg).
+     * @param page the page on which some or all of the content items designated
+     *  by the K entry shall be rendered.
+     */
+    public void setPage(PDPage page)
+    {
+        this.getCOSDictionary().setItem("Pg", page);
+    }
+
+    /**
+     * Returns the class names together with their revision numbers (C).
+     * A single name is added with 0; inside an array every name is added
+     * with 0 and an integer sets the revision number of the name read
+     * before it.
+     * 
+     * @return the class names
+     */
+    public Revisions<String> getClassNames()
+    {
+        Revisions<String> result = new Revisions<String>();
+        COSBase entry = this.getCOSDictionary().getDictionaryObject("C");
+        if (entry instanceof COSName)
+        {
+            result.addObject(((COSName) entry).getName(), 0);
+        }
+        else if (entry instanceof COSArray)
+        {
+            String lastName = null;
+            for (Iterator<COSBase> items = ((COSArray) entry).iterator();
+                items.hasNext();)
+            {
+                COSBase item = items.next();
+                if (item instanceof COSInteger)
+                {
+                    int revision = ((COSInteger) item).intValue();
+                    result.setRevisionNumber(lastName, revision);
+                }
+                else if (item instanceof COSName)
+                {
+                    lastName = ((COSName) item).getName();
+                    result.addObject(lastName, 0);
+                }
+            }
+        }
+        return result;
+    }
+
+    /**
+     * Sets the class names together with their revision numbers (C).
+     * A single class name with revision number 0 is stored as a plain name,
+     * anything else as an array of name / revision number pairs.
+     * 
+     * @param classNames the class names
+     * @throws IllegalArgumentException if a revision number is negative
      */
-    public COSDictionary getCOSDictionary()
+    public void setClassNames(Revisions<String> classNames)
     {
-        return dictionary;
+        String key = "C";
+        if ((classNames.size() == 1) && (classNames.getRevisionNumber(0) == 0))
+        {
+            String className = classNames.getObject(0);
+            this.getCOSDictionary().setName(key, className);
+            return;
+        }
+        COSArray array = new COSArray();
+        for (int i = 0; i < classNames.size(); i++)
+        {
+            String className = classNames.getObject(i);
+            int revisionNumber = classNames.getRevisionNumber(i);
+            if (revisionNumber < 0)
+            {
+                // fail before the dictionary is touched instead of silently
+                // writing an invalid revision number into the document
+                throw new IllegalArgumentException(
+                    "The revision number shall be > -1, but was "
+                    + revisionNumber + " for class name " + className);
+            }
+            array.add(COSName.getPDFName(className));
+            array.add(COSInteger.get(revisionNumber));
+        }
+        this.getCOSDictionary().setItem(key, array);
     }
+
+    /**
+     * Adds a class name.
+     * 
+     * @param className the class name
+     */
+    public void addClassName(String className)
+    {
+        String key = "C";
+        COSBase c = this.getCOSDictionary().getDictionaryObject(key);
+        COSArray array = null;
+        if (c instanceof COSArray)
+        {
+            array = (COSArray) c;
+        }
+        else
+        {
+            array = new COSArray();
+            if (c != null)
+            {
+                array.add(c);
+                array.add(COSInteger.get(0));
+            }
+        }
+        this.getCOSDictionary().setItem(key, array);
+        array.add(COSName.getPDFName(className));
+        array.add(COSInteger.get(this.getRevisionNumber()));
+    }
+
+    /**
+     * Removes a class name together with the revision number stored after it.
+     * If a single class name with revision number 0 remains, the array is
+     * replaced by that plain name.
+     * 
+     * @param className the class name
+     */
+    public void removeClassName(String className)
+    {
+        String key = "C";
+        COSBase c = this.getCOSDictionary().getDictionaryObject(key);
+        COSName name = COSName.getPDFName(className);
+        if (c instanceof COSArray)
+        {
+            COSArray array = (COSArray) c;
+            int index = array.indexOf(name);
+            if (index > -1)
+            {
+                array.remove(index);
+                // the array holds name / revision number pairs; a revision
+                // number left behind would be attributed to the preceding
+                // class name and would keep the array from collapsing below
+                if ((index < array.size())
+                    && (array.getObject(index) instanceof COSInteger))
+                {
+                    array.remove(index);
+                }
+            }
+            if ((array.size() == 2) && (array.getInt(1) == 0))
+            {
+                this.getCOSDictionary().setItem(key, array.getObject(0));
+            }
+        }
+        else
+        {
+            COSBase directC = c;
+            if (c instanceof COSObject)
+            {
+                directC = ((COSObject) c).getObject();
+            }
+            if (name.equals(directC))
+            {
+                this.getCOSDictionary().setItem(key, null);
+            }
+        }
+    }
+
+    /**
+     * Returns the revision number (R).
+     * 
+     * @return the revision number
+     */
+    public int getRevisionNumber()
+    {
+        return this.getCOSDictionary().getInt(COSName.R, 0);
+    }
+
+    /**
+     * Sets the revision number (R).
+     * 
+     * @param revisionNumber the revision number
+     */
+    public void setRevisionNumber(int revisionNumber)
+    {
+        this.getCOSDictionary().setInt(COSName.R, revisionNumber);
+    }
+
+    /**
+     * Returns the title (T).
+     * 
+     * @return the title
+     */
+    public String getTitle()
+    {
+        return this.getCOSDictionary().getString("T");
+    }
+
+    /**
+     * Sets the title (T).
+     * 
+     * @param title the title
+     */
+    public void setTitle(String title)
+    {
+        this.getCOSDictionary().setString("T", title);
+    }
+
+    /**
+     * Returns the language (Lang).
+     * 
+     * @return the language
+     */
+    public String getLanguage()
+    {
+        return this.getCOSDictionary().getString("Lang");
+    }
+
+    /**
+     * Sets the language (Lang).
+     * 
+     * @param language the language
+     */
+    public void setLanguage(String language)
+    {
+        this.getCOSDictionary().setString("Lang", language);
+    }
+
+    /**
+     * Returns the alternate description (Alt).
+     * 
+     * @return the alternate description
+     */
+    public String getAlternateDescription()
+    {
+        return this.getCOSDictionary().getString("Alt");
+    }
+
+    /**
+     * Sets the alternate description (Alt).
+     * 
+     * @param alternateDescription the alternate description
+     */
+    public void setAlternateDescription(String alternateDescription)
+    {
+        this.getCOSDictionary().setString("Alt", alternateDescription);
+    }
+
+    /**
+     * Returns the expanded form (E).
+     * 
+     * @return the expanded form
+     */
+    public String getExpandedForm()
+    {
+        return this.getCOSDictionary().getString("E");
+    }
+
+    /**
+     * Sets the expanded form (E).
+     * 
+     * @param expandedForm the expanded form
+     */
+    public void setExpandedForm(String expandedForm)
+    {
+        this.getCOSDictionary().setString("E", expandedForm);
+    }
+
+    /**
+     * Returns the actual text (ActualText).
+     * 
+     * @return the actual text
+     */
+    public String getActualText()
+    {
+        return this.getCOSDictionary().getString("ActualText");
+    }
+
+    /**
+     * Sets the actual text (ActualText).
+     * 
+     * @param actualText the actual text
+     */
+    public void setActualText(String actualText)
+    {
+        this.getCOSDictionary().setString("ActualText", actualText);
+    }
+
+    /**
+     * Returns the standard structure type, the actual structure type is mapped
+     * to in the role map. The role map is followed transitively until a type
+     * is reached that is not mapped or is mapped to itself.
+     * 
+     * @return the standard structure type, or the structure type itself if
+     *  no role map is available
+     */
+    public String getStandardStructureType()
+    {
+        String type = this.getStructureType();
+        // resolve the role map once; it is null if no structure tree root
+        // can be reached
+        Map<String, String> roleMap = this.getRoleMap();
+        if (roleMap == null)
+        {
+            return type;
+        }
+        // a cycle-free chain visits every key at most once, so bounding the
+        // number of hops by the map size terminates on cyclic role maps
+        // without changing the result for valid ones
+        for (int hops = 0; hops < roleMap.size(); hops++)
+        {
+            String mappedType = roleMap.get(type);
+            if ((mappedType == null) || type.equals(mappedType))
+            {
+                break;
+            }
+            type = mappedType;
+        }
+        return type;
+    }
+
+    /**
+     * Appends a marked-content sequence kid.
+     * 
+     * @param markedContent the marked-content sequence
+     */
+    public void appendKid(PDMarkedContent markedContent)
+    {
+        this.appendKid(COSInteger.get(markedContent.getMCID()));
+    }
+
+    /**
+     * Appends a marked-content reference kid.
+     * 
+     * @param markedContentReference the marked-content reference
+     */
+    public void appendKid(PDMarkedContentReference markedContentReference)
+    {
+        this.appendObjectableKid(markedContentReference);
+    }
+
+    /**
+     * Appends an object reference kid.
+     * 
+     * @param objectReference the object reference
+     */
+    public void appendKid(PDObjectReference objectReference)
+    {
+        this.appendObjectableKid(objectReference);
+    }
+
+    /**
+     * Inserts a marked-content identifier kid before a reference kid.
+     * 
+     * @param markedContentIdentifier the marked-content identifier
+     * @param refKid the reference kid
+     */
+    public void insertBefore(COSInteger markedContentIdentifier, Object refKid)
+    {
+        // the cast selects the COSBase overload, as removeKid(COSInteger)
+        // does; without it this method calls itself until the stack overflows
+        // NOTE(review): assumes PDStructureNode declares
+        // insertBefore(COSBase, Object) - confirm
+        this.insertBefore((COSBase) markedContentIdentifier, refKid);
+    }
+
+    /**
+     * Inserts a marked-content reference kid before a reference kid.
+     * 
+     * @param markedContentReference the marked-content reference
+     * @param refKid the reference kid
+     */
+    public void insertBefore(PDMarkedContentReference markedContentReference, Object refKid)
+    {
+        // delegate to the objectable helper, as appendKid and removeKid do;
+        // calling insertBefore here resolves to this very method and recurses
+        // until the stack overflows
+        // NOTE(review): assumes PDStructureNode declares
+        // insertObjectableBefore(COSObjectable, Object) - confirm
+        this.insertObjectableBefore(markedContentReference, refKid);
+    }
+
+    /**
+     * Inserts an object reference kid before a reference kid.
+     * 
+     * @param objectReference the object reference
+     * @param refKid the reference kid
+     */
+    public void insertBefore(PDObjectReference objectReference, Object refKid)
+    {
+        // delegate to the objectable helper, as appendKid and removeKid do;
+        // calling insertBefore here resolves to this very method and recurses
+        // until the stack overflows
+        // NOTE(review): assumes PDStructureNode declares
+        // insertObjectableBefore(COSObjectable, Object) - confirm
+        this.insertObjectableBefore(objectReference, refKid);
+    }
+
+    /**
+     * Removes a marked-content identifier kid.
+     * 
+     * @param markedContentIdentifier the marked-content identifier
+     */
+    public void removeKid(COSInteger markedContentIdentifier)
+    {
+        this.removeKid((COSBase) markedContentIdentifier);
+    }
+
+    /**
+     * Removes a marked-content reference kid.
+     * 
+     * @param markedContentReference the marked-content reference
+     */
+    public void removeKid(PDMarkedContentReference markedContentReference)
+    {
+        this.removeObjectableKid(markedContentReference);
+    }
+
+    /**
+     * Removes an object reference kid.
+     * 
+     * @param objectReference the object reference
+     */
+    public void removeKid(PDObjectReference objectReference)
+    {
+        this.removeObjectableKid(objectReference);
+    }
+
+
+    /**
+     * Returns the structure tree root.
+     * 
+     * @return the structure tree root
+     */
+    private PDStructureTreeRoot getStructureTreeRoot()
+    {
+        PDStructureNode parent = this.getParent();
+        while (parent instanceof PDStructureElement)
+        {
+            parent = ((PDStructureElement) parent).getParent();
+        }
+        if (parent instanceof PDStructureTreeRoot)
+        {
+            return (PDStructureTreeRoot) parent;
+        }
+        return null;
+    }
+
+    /**
+     * Returns the role map of the structure tree root this element belongs to.
+     * 
+     * @return the role map, or <code>null</code> if no structure tree root
+     *  can be reached through the parent chain
+     */
+    private Map<String, String> getRoleMap()
+    {
+        PDStructureTreeRoot root = this.getStructureTreeRoot();
+        if (root != null)
+        {
+            return root.getRoleMap();
+        }
+        // callers have to cope with a missing role map
+        return null;
+    }
+
 }

Added: pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/PDStructureNode.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/PDStructureNode.java?rev=921062&view=auto
==============================================================================
--- pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/PDStructureNode.java (added)
+++ pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/PDStructureNode.java Tue Mar  9 19:18:33 2010
@@ -0,0 +1,428 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.pdfbox.cos.COSArray;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSInteger;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSObject;
+import org.apache.pdfbox.pdmodel.common.COSArrayList;
+import org.apache.pdfbox.pdmodel.common.COSObjectable;
+import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObject;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
+
+/**
+ * A node in the structure tree.
+ * <p>
+ * A node wraps a COS dictionary. Its kids entry (K) is stored either as a
+ * single object (one kid) or as an array (several kids); the kid management
+ * methods of this class switch between both forms as kids are added or removed.
+ *
+ * @author Koch
+ * @version $Revision: $
+ */
+public abstract class PDStructureNode implements COSObjectable
+{
+
+    /**
+     * Creates a node in the structure tree. Can be either a structure tree root,
+     *  or a structure element.
+     *
+     * @param node the node dictionary
+     * @return the structure node
+     * @throws IllegalArgumentException if the dictionary has a Type entry whose
+     *  value is neither StructTreeRoot nor StructElem
+     */
+    public static PDStructureNode create(COSDictionary node)
+    {
+        String type = node.getNameAsString(COSName.TYPE);
+        if ("StructTreeRoot".equals(type))
+        {
+            return new PDStructureTreeRoot(node);
+        }
+        // a missing Type entry is treated as a structure element
+        if ((type == null) || "StructElem".equals(type))
+        {
+            return new PDStructureElement(node);
+        }
+        throw new IllegalArgumentException("Dictionary must not include a Type entry with a value that is neither StructTreeRoot nor StructElem.");
+    }
+
+
+    private final COSDictionary dictionary;
+
+    /**
+     * Returns the dictionary wrapped by this node.
+     *
+     * @return the node dictionary
+     */
+    protected COSDictionary getCOSDictionary()
+    {
+        return dictionary;
+    }
+
+    /**
+     * Constructor. Creates a new dictionary and sets its Type entry.
+     *
+     * @param type the type
+     */
+    protected PDStructureNode(String type)
+    {
+        this.dictionary = new COSDictionary();
+        this.dictionary.setName(COSName.TYPE, type);
+    }
+
+    /**
+     * Constructor for an existing structure node.
+     *
+     * @param dictionary The existing dictionary.
+     */
+    protected PDStructureNode(COSDictionary dictionary)
+    {
+        this.dictionary = dictionary;
+    }
+
+    /**
+     * {@inheritDoc}
+     */
+    public COSBase getCOSObject()
+    {
+        return this.dictionary;
+    }
+
+    /**
+     * Returns the type.
+     *
+     * @return the type
+     */
+    public String getType()
+    {
+        return this.getCOSDictionary().getNameAsString(COSName.TYPE);
+    }
+
+    /**
+     * Returns a list of objects for the kids (K). Kids for which
+     * {@link #createObject(COSBase)} yields <code>null</code> are left out.
+     *
+     * @return a list of objects for the kids, empty if there are none
+     */
+    public List<Object> getKids()
+    {
+        List<Object> kidObjects = new ArrayList<Object>();
+        COSBase k = this.getCOSDictionary().getDictionaryObject("K");
+        if (k instanceof COSArray)
+        {
+            // several kids: convert each array entry
+            Iterator<COSBase> kids = ((COSArray) k).iterator();
+            while (kids.hasNext())
+            {
+                COSBase kid = kids.next();
+                Object kidObject = this.createObject(kid);
+                if (kidObject != null)
+                {
+                    kidObjects.add(kidObject);
+                }
+            }
+        }
+        else
+        {
+            // zero or one kid: K itself is the kid (createObject maps null to null)
+            Object kidObject = this.createObject(k);
+            if (kidObject != null)
+            {
+                kidObjects.add(kidObject);
+            }
+        }
+        return kidObjects;
+    }
+
+    /**
+     * Sets the kids (K). The list is converted to a COS array and stored as
+     * the K entry, replacing any previous value.
+     *
+     * @param kids the kids
+     */
+    public void setKids(List<Object> kids)
+    {
+        this.getCOSDictionary().setItem("K",
+            COSArrayList.converterToCOSArray(kids));
+    }
+
+    /**
+     * Appends a structure element kid and makes this node its parent.
+     *
+     * @param structureElement the structure element
+     */
+    public void appendKid(PDStructureElement structureElement)
+    {
+        this.appendObjectableKid(structureElement);
+        structureElement.setParent(this);
+    }
+
+    /**
+     * Appends an objectable kid. Does nothing if the objectable is
+     * <code>null</code>.
+     *
+     * @param objectable the objectable
+     */
+    protected void appendObjectableKid(COSObjectable objectable)
+    {
+        if (objectable == null)
+        {
+            return;
+        }
+        this.appendKid(objectable.getCOSObject());
+    }
+
+    /**
+     * Appends a COS base kid. Does nothing if the object is <code>null</code>.
+     *
+     * @param object the COS base
+     */
+    protected void appendKid(COSBase object)
+    {
+        if (object == null)
+        {
+            return;
+        }
+        COSBase k = this.getCOSDictionary().getDictionaryObject("K");
+        if (k == null)
+        {
+            // currently no kid: set new kid as kids
+            this.getCOSDictionary().setItem("K", object);
+        }
+        else if (k instanceof COSArray)
+        {
+            // currently more than one kid: add new kid to existing array
+            COSArray array = (COSArray) k;
+            array.add(object);
+        }
+        else
+        {
+            // currently one kid: put current and new kid into array and set array as kids
+            COSArray array = new COSArray();
+            array.add(k);
+            array.add(object);
+            this.getCOSDictionary().setItem("K", array);
+        }
+    }
+
+    /**
+     * Inserts a structure element kid before a reference kid.
+     *
+     * @param newKid the structure element
+     * @param refKid the reference kid
+     */
+    public void insertBefore(PDStructureElement newKid, Object refKid)
+    {
+        // the cast selects the COSObjectable overload instead of this method
+        this.insertBefore((COSObjectable) newKid, refKid);
+    }
+
+    /**
+     * Inserts an objectable kid before a reference kid. Does nothing if the
+     * new kid is <code>null</code>.
+     *
+     * @param newKid the objectable
+     * @param refKid the reference kid
+     */
+    protected void insertBefore(COSObjectable newKid, Object refKid)
+    {
+        if (newKid == null)
+        {
+            return;
+        }
+        this.insertBefore(newKid.getCOSObject(), refKid);
+    }
+
+    /**
+     * Inserts a COS base kid before a reference kid.
+     * <p>
+     * Nothing is inserted if either argument is <code>null</code>, if this
+     * node has no kids, if the reference kid is neither a
+     * {@link COSObjectable} nor a {@link COSInteger}, or if the reference kid
+     * is not one of the current kids.
+     *
+     * @param newKid the COS base
+     * @param refKid the reference kid
+     */
+    protected void insertBefore(COSBase newKid, Object refKid)
+    {
+        if ((newKid == null) || (refKid == null))
+        {
+            return;
+        }
+        COSBase k = this.getCOSDictionary().getDictionaryObject("K");
+        if (k == null)
+        {
+            return;
+        }
+        COSBase refKidBase = null;
+        if (refKid instanceof COSObjectable)
+        {
+            refKidBase = ((COSObjectable) refKid).getCOSObject();
+        }
+        else if (refKid instanceof COSInteger)
+        {
+            refKidBase = (COSInteger) refKid;
+        }
+        if (refKidBase == null)
+        {
+            // reference kid of an unsupported type: it cannot be located
+            return;
+        }
+        if (k instanceof COSArray)
+        {
+            COSArray array = (COSArray) k;
+            int refIndex = array.indexOfObject(refKidBase);
+            if (refIndex < 0)
+            {
+                // reference kid is not among the kids; inserting at a
+                // negative index would fail, so leave the kids untouched
+                return;
+            }
+            array.add(refIndex, newKid.getCOSObject());
+        }
+        else
+        {
+            boolean onlyKid = k.equals(refKidBase);
+            if (!onlyKid && (k instanceof COSObject))
+            {
+                // K may be an indirect reference: compare the referenced object
+                COSBase kObj = ((COSObject) k).getObject();
+                onlyKid = (kObj != null) && kObj.equals(refKidBase);
+            }
+            if (onlyKid)
+            {
+                // single kid becomes an array of new kid followed by reference kid
+                COSArray array = new COSArray();
+                array.add(newKid);
+                array.add(refKidBase);
+                this.getCOSDictionary().setItem("K", array);
+            }
+        }
+    }
+
+    /**
+     * Removes a structure element kid. If the kid was removed, its parent is
+     * reset to <code>null</code>.
+     *
+     * @param structureElement the structure element
+     * @return <code>true</code> if the kid was removed, <code>false</code> otherwise
+     */
+    public boolean removeKid(PDStructureElement structureElement)
+    {
+        boolean removed = this.removeObjectableKid(structureElement);
+        if (removed)
+        {
+            structureElement.setParent(null);
+        }
+        return removed;
+    }
+
+    /**
+     * Removes an objectable kid.
+     *
+     * @param objectable the objectable
+     * @return <code>true</code> if the kid was removed, <code>false</code> otherwise
+     */
+    protected boolean removeObjectableKid(COSObjectable objectable)
+    {
+        if (objectable == null)
+        {
+            return false;
+        }
+        return this.removeKid(objectable.getCOSObject());
+    }
+
+    /**
+     * Removes a COS base kid.
+     *
+     * @param object the COS base
+     * @return <code>true</code> if the kid was removed, <code>false</code> otherwise
+     */
+    protected boolean removeKid(COSBase object)
+    {
+        if (object == null)
+        {
+            return false;
+        }
+        COSBase k = this.getCOSDictionary().getDictionaryObject("K");
+        if (k == null)
+        {
+            // no kids: objectable is not a kid
+            return false;
+        }
+        else if (k instanceof COSArray)
+        {
+            // currently more than one kid: remove kid from existing array
+            COSArray array = (COSArray) k;
+            boolean removed = array.removeObject(object);
+            // if now only one kid: set remaining kid as kids
+            if (array.size() == 1)
+            {
+                this.getCOSDictionary().setItem("K", array.getObject(0));
+            }
+            return removed;
+        }
+        else
+        {
+            // currently one kid: if current kid equals given object, remove kids entry
+            boolean onlyKid = k.equals(object);
+            if (!onlyKid && (k instanceof COSObject))
+            {
+                // K may be an indirect reference: compare the referenced object
+                COSBase kObj = ((COSObject) k).getObject();
+                onlyKid = (kObj != null) && kObj.equals(object);
+            }
+            if (onlyKid)
+            {
+                this.getCOSDictionary().setItem("K", null);
+                return true;
+            }
+            return false;
+        }
+    }
+
+    /**
+     * Creates an object for a kid of this structure node.
+     * The type of object depends on the type of the kid. It can be
+     * <ul>
+     * <li>a {@link PDStructureElement} (dictionary without Type entry or of
+     *  the structure element type),</li>
+     * <li>a {@link PDObjectReference} (dictionary of the object reference
+     *  type),</li>
+     * <li>a {@link PDMarkedContentReference} (dictionary of type MCR),</li>
+     * <li>an {@link Integer} (integer marked-content identifier).</li>
+     * </ul>
+     * Indirect references are resolved before the kid is inspected.
+     *
+     * @param kid the kid
+     * @return the object, or <code>null</code> if the kid is none of the above
+     */
+    protected Object createObject(COSBase kid)
+    {
+        COSDictionary kidDic = null;
+        if (kid instanceof COSDictionary)
+        {
+            kidDic = (COSDictionary) kid;
+        }
+        else if (kid instanceof COSObject)
+        {
+            COSBase base = ((COSObject) kid).getObject();
+            if (base instanceof COSDictionary)
+            {
+                kidDic = (COSDictionary) base;
+            }
+        }
+        if (kidDic != null)
+        {
+            String type = kidDic.getNameAsString("Type");
+            if ((type == null) || PDStructureElement.TYPE.equals(type))
+            {
+                // A structure element dictionary denoting another structure
+                // element
+                return new PDStructureElement(kidDic);
+            }
+            else if (PDObjectReference.TYPE.equals(type))
+            {
+                // An object reference dictionary denoting a PDF object
+                return new PDObjectReference(kidDic);
+            }
+            else if ("MCR".equals(type))
+            {
+                // A marked-content reference dictionary denoting a
+                // marked-content sequence
+                return new PDMarkedContentReference(kidDic);
+            }
+        }
+        else if (kid instanceof COSInteger)
+        {
+            // An integer marked-content identifier denoting a
+            // marked-content sequence
+            COSInteger mcid = (COSInteger) kid;
+            return mcid.intValue();
+        }
+        return null;
+    }
+
+}

Modified: pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/PDStructureTreeRoot.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/PDStructureTreeRoot.java?rev=921062&r1=921061&r2=921062&view=diff
==============================================================================
--- pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/PDStructureTreeRoot.java (original)
+++ pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/PDStructureTreeRoot.java Tue Mar  9 19:18:33 2010
@@ -16,10 +16,14 @@
  */
 package org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure;
 
+import java.io.IOException;
+import java.util.Hashtable;
+import java.util.Map;
+
 import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSDictionary;
-import org.apache.pdfbox.cos.COSName;
-import org.apache.pdfbox.pdmodel.common.COSObjectable;
+import org.apache.pdfbox.pdmodel.common.COSDictionaryMap;
+import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
 
 /**
  * A root of a structure tree.
@@ -27,9 +31,11 @@ import org.apache.pdfbox.pdmodel.common.
  * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
  * @version $Revision: 1.2 $
  */
-public class PDStructureTreeRoot implements COSObjectable
+public class PDStructureTreeRoot extends PDStructureNode
 {
-    private COSDictionary dictionary;
+
+    public static final String TYPE = "StructTreeRoot";
+
 
     /**
      * Default Constructor.
@@ -37,8 +43,7 @@ public class PDStructureTreeRoot impleme
      */
     public PDStructureTreeRoot()
     {
-        dictionary = new COSDictionary();
-        dictionary.setName( COSName.TYPE, "StructTreeRoot" );
+        super(TYPE);
     }
 
     /**
@@ -48,26 +53,82 @@ public class PDStructureTreeRoot impleme
      */
     public PDStructureTreeRoot( COSDictionary dic )
     {
-        dictionary = dic;
+        super(dic);
     }
 
+
     /**
-     * Convert this standard java object to a COS object.
-     *
-     * @return The cos object that matches this Java object.
+     * Returns the ID tree.
+     * 
+     * @return the ID tree
      */
-    public COSBase getCOSObject()
+    public PDNameTreeNode getIDTree()
     {
-        return dictionary;
+        COSDictionary idTreeDic = (COSDictionary) this.getCOSDictionary()
+            .getDictionaryObject("IDTree");
+        if (idTreeDic != null)
+        {
+            return new PDNameTreeNode(idTreeDic, PDStructureElement.class);
+        }
+        return null;
     }
 
     /**
-     * Get the low level dictionary that this object wraps.
-     *
-     * @return The cos dictionary that matches this Java object.
+     * Sets the ID tree.
+     * 
+     * @param idTree the ID tree
+     */
+    public void setIDTree(PDNameTreeNode idTree)
+    {
+        // store the tree under the IDTree key of the root dictionary
+        COSDictionary rootDictionary = getCOSDictionary();
+        rootDictionary.setItem("IDTree", idTree);
+    }
+
+    /**
+     * Returns the next key in the parent tree.
+     * 
+     * @return the next key in the parent tree
      */
-    public COSDictionary getCOSDictionary()
+    public int getParentTreeNextKey()
     {
-        return dictionary;
+        return this.getCOSDictionary().getInt("ParentTreeNextKey");
     }
+
+    /**
+     * Returns the role map (RoleMap).
+     * <p>
+     * An empty map is returned when there is no RoleMap dictionary, or when
+     * converting it raises an {@link IOException} (the stack trace is printed
+     * in that case).
+     *
+     * @return the role map
+     */
+    @SuppressWarnings("unchecked")
+    public Map<String, String> getRoleMap()
+    {
+        COSBase roleMapBase = getCOSDictionary().getDictionaryObject("RoleMap");
+        if (!(roleMapBase instanceof COSDictionary))
+        {
+            return new Hashtable<String, String>();
+        }
+        Map<String, String> roleMap;
+        try
+        {
+            roleMap = COSDictionaryMap.convertBasicTypesToMap((COSDictionary) roleMapBase);
+        }
+        catch (IOException e)
+        {
+            e.printStackTrace();
+            roleMap = new Hashtable<String, String>();
+        }
+        return roleMap;
+    }
+
+    /**
+     * Sets the role map (RoleMap). Every map entry is written as a name
+     * entry of a new dictionary, which then replaces the RoleMap entry.
+     *
+     * @param roleMap the role map
+     */
+    public void setRoleMap(Map<String, String> roleMap)
+    {
+        COSDictionary roleMapDictionary = new COSDictionary();
+        for (Map.Entry<String, String> mapping : roleMap.entrySet())
+        {
+            roleMapDictionary.setName(mapping.getKey(), mapping.getValue());
+        }
+        getCOSDictionary().setItem("RoleMap", roleMapDictionary);
+    }
+
 }

Added: pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/Revisions.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/Revisions.java?rev=921062&view=auto
==============================================================================
--- pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/Revisions.java (added)
+++ pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/logicalstructure/Revisions.java Tue Mar  9 19:18:33 2010
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Stores objects together with a revision number for each of them.
+ * Objects and revision numbers are kept in two lists that share the same
+ * positions.
+ *
+ * @author Koch
+ * @version $Revision: $
+ *
+ * @param <T> the type of object to store the revision numbers with
+ */
+public class Revisions<T>
+{
+
+    private final List<T> objects = new ArrayList<T>();
+    private final List<Integer> revisionNumbers = new ArrayList<Integer>();
+
+
+    /**
+     * Creates an empty collection of revisions.
+     */
+    public Revisions()
+    {
+    }
+
+
+    /**
+     * Returns the object at the specified position.
+     *
+     * @param index the position
+     * @return the object
+     * @throws IndexOutOfBoundsException if the index is out of range
+     */
+    public T getObject(int index) throws IndexOutOfBoundsException
+    {
+        return objects.get(index);
+    }
+
+    /**
+     * Returns the revision number at the specified position.
+     *
+     * @param index the position
+     * @return the revision number
+     * @throws IndexOutOfBoundsException if the index is out of range
+     */
+    public int getRevisionNumber(int index) throws IndexOutOfBoundsException
+    {
+        return revisionNumbers.get(index);
+    }
+
+    /**
+     * Adds an object with a specified revision number.
+     *
+     * @param object the object
+     * @param revisionNumber the revision number
+     */
+    protected void addObject(T object, int revisionNumber)
+    {
+        objects.add(object);
+        revisionNumbers.add(revisionNumber);
+    }
+
+    /**
+     * Sets the revision number of a specified object. Nothing happens if the
+     * object is not stored here.
+     *
+     * @param object the object
+     * @param revisionNumber the revision number
+     */
+    protected void setRevisionNumber(T object, int revisionNumber)
+    {
+        int position = objects.indexOf(object);
+        if (position < 0)
+        {
+            return;
+        }
+        revisionNumbers.set(position, revisionNumber);
+    }
+
+    /**
+     * Returns the size.
+     *
+     * @return the number of stored objects
+     */
+    public int size()
+    {
+        return objects.size();
+    }
+
+    /**
+     * {@inheritDoc}
+     */
+    @Override
+    public String toString()
+    {
+        StringBuilder text = new StringBuilder();
+        String separator = "";
+        for (int i = 0; i < objects.size(); i++)
+        {
+            text.append(separator)
+                .append("object=").append(objects.get(i))
+                .append(", revisionNumber=").append(revisionNumbers.get(i));
+            separator = "; ";
+        }
+        return text.toString();
+    }
+
+}

Added: pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/markedcontent/PDMarkedContent.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/markedcontent/PDMarkedContent.java?rev=921062&view=auto
==============================================================================
--- pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/markedcontent/PDMarkedContent.java (added)
+++ pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/markedcontent/PDMarkedContent.java Tue Mar  9 19:18:33 2010
@@ -0,0 +1,174 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.pdmodel.documentinterchange.markedcontent;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObject;
+import org.apache.pdfbox.util.TextPosition;
+
+/**
+ * A marked content: the tag and properties of a marked-content sequence
+ * together with the contents collected for it.
+ *
+ * @author Koch
+ * @version $Revision: $
+ */
+public class PDMarkedContent
+{
+
+    private final String tag;
+    private final COSDictionary properties;
+    private final List<Object> contents;
+
+
+    /**
+     * Creates a new marked content object.
+     *
+     * @param tag the tag, may be <code>null</code>
+     * @param properties the properties, may be <code>null</code>
+     */
+    public PDMarkedContent(COSName tag, COSDictionary properties)
+    {
+        this.tag = tag == null ? null : tag.getName();
+        this.properties = properties;
+        this.contents = new ArrayList<Object>();
+    }
+
+
+    /**
+     * Gets the tag.
+     *
+     * @return the tag, or <code>null</code> if none was given
+     */
+    public String getTag()
+    {
+        return this.tag;
+    }
+
+    /**
+     * Gets the properties.
+     *
+     * @return the properties, or <code>null</code> if none were given
+     */
+    public COSDictionary getProperties()
+    {
+        return this.properties;
+    }
+
+    /**
+     * Gets the marked-content identifier (MCID).
+     *
+     * @return the marked-content identifier, or -1 if this marked content
+     *  has no properties
+     */
+    public int getMCID()
+    {
+        // The return type is a primitive int: a null branch here would be
+        // unboxed and raise a NullPointerException, so use -1 instead.
+        return this.getProperties() == null ? -1 :
+            this.getProperties().getInt("MCID");
+    }
+
+    /**
+     * Gets the language (Lang).
+     *
+     * @return the language, or <code>null</code> if there are no properties
+     */
+    public String getLanguage()
+    {
+        return this.getProperties() == null ? null :
+            this.getProperties().getNameAsString("Lang");
+    }
+
+    /**
+     * Gets the actual text (ActualText).
+     *
+     * @return the actual text, or <code>null</code> if there are no properties
+     */
+    public String getActualText()
+    {
+        return this.getProperties() == null ? null :
+            this.getProperties().getString("ActualText");
+    }
+
+    /**
+     * Gets the alternate description (Alt).
+     *
+     * @return the alternate description, or <code>null</code> if there are
+     *  no properties
+     */
+    public String getAlternateDescription()
+    {
+        return this.getProperties() == null ? null :
+            this.getProperties().getString("Alt");
+    }
+
+    /**
+     * Gets the contents of the marked content sequence. Can be
+     * <ul>
+     *   <li>{@link TextPosition},</li>
+     *   <li>{@link PDMarkedContent}, or</li>
+     *   <li>{@link PDXObject}.</li>
+     * </ul>
+     * The returned list is the internal list, not a copy.
+     *
+     * @return the contents of the marked content sequence
+     */
+    public List<Object> getContents()
+    {
+        return this.contents;
+    }
+
+    /**
+     * Adds a text position to the contents.
+     *
+     * @param text the text position
+     */
+    public void addText(TextPosition text)
+    {
+        this.getContents().add(text);
+    }
+
+    /**
+     * Adds a marked content to the contents.
+     *
+     * @param markedContent the marked content
+     */
+    public void addMarkedContent(PDMarkedContent markedContent)
+    {
+        this.getContents().add(markedContent);
+    }
+
+    /**
+     * Adds an XObject to the contents.
+     *
+     * @param xobject the XObject
+     */
+    public void addXObject(PDXObject xobject)
+    {
+        this.getContents().add(xobject);
+    }
+
+
+    /**
+     * {@inheritDoc}
+     */
+    @Override
+    public String toString()
+    {
+        StringBuilder sb = new StringBuilder("tag=").append(this.tag)
+            .append(", properties=").append(this.properties);
+        sb.append(", contents=").append(this.contents);
+        return sb.toString();
+    }
+
+}

Added: pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/markedcontent/package.html
URL: http://svn.apache.org/viewvc/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/markedcontent/package.html?rev=921062&view=auto
==============================================================================
--- pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/markedcontent/package.html (added)
+++ pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/documentinterchange/markedcontent/package.html Tue Mar  9 19:18:33 2010
@@ -0,0 +1,26 @@
+<!--
+ ! Licensed to the Apache Software Foundation (ASF) under one or more
+ ! contributor license agreements.  See the NOTICE file distributed with
+ ! this work for additional information regarding copyright ownership.
+ ! The ASF licenses this file to You under the Apache License, Version 2.0
+ ! (the "License"); you may not use this file except in compliance with
+ ! the License.  You may obtain a copy of the License at
+ !
+ !      http://www.apache.org/licenses/LICENSE-2.0
+ !
+ ! Unless required by applicable law or agreed to in writing, software
+ ! distributed under the License is distributed on an "AS IS" BASIS,
+ ! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ! See the License for the specific language governing permissions and
+ ! limitations under the License.
+ !-->
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
+<html>
+<head>
+
+</head>
+<body>
+The marked content package provides a mechanism for modeling marked-content
+sequences.
+</body>
+</html>

Added: pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java?rev=921062&view=auto
==============================================================================
--- pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java (added)
+++ pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java Tue Mar  9 19:18:33 2010
@@ -0,0 +1,265 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.util;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Stack;
+
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
+import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObject;
+
+/**
+ * This is a stream engine to extract the marked content of a pdf.
+ * <p>
+ * The operator processors for BMC/BDC/EMC and for XObject invocation call back into
+ * this class, which builds a tree of {@link PDMarkedContent} objects: top-level
+ * sequences are collected in a list, nested sequences are attached to the sequence
+ * that encloses them.
+ *
+ * @author koch
+ * @version $Revision$
+ */
+public class PDFMarkedContentExtractor extends PDFStreamEngine
+{
+    private boolean suppressDuplicateOverlappingText = true;
+
+    /** Top-level marked-content sequences, in the order they were begun. */
+    private final List<PDMarkedContent> markedContents = new ArrayList<PDMarkedContent>();
+
+    /** Marked-content sequences that have been begun but not yet ended; innermost on top. */
+    private final Stack<PDMarkedContent> currentMarkedContents = new Stack<PDMarkedContent>();
+
+    /** Text positions already accepted, keyed by their character string. */
+    private final Map<String, List<TextPosition>> characterListMapping =
+        new HashMap<String, List<TextPosition>>();
+
+    /**
+     * encoding that text will be written in (or null).
+     */
+    protected String outputEncoding;
+
+    /**
+     * Normalizer created for the configured output encoding.
+     * NOTE(review): it is only initialised by the constructors and is not otherwise
+     * used in this class.
+     */
+    private TextNormalize normalize = null;
+
+    /**
+     * Instantiate a new PDFMarkedContentExtractor object.  This object will load properties from
+     * Resources/PDFMarkedContentExtractor.properties and will not do anything special to
+     * convert the text to a more encoding-specific output.
+     *
+     * @throws IOException If there is an error loading the properties.
+     */
+    public PDFMarkedContentExtractor() throws IOException
+    {
+        // the cast selects the String constructor; a bare null would be ambiguous
+        this( (String) null );
+    }
+
+
+    /**
+     * Instantiate a new PDFMarkedContentExtractor object.  Loading all of the operator mappings
+     * from the properties object that is passed in.  Does not convert the text
+     * to more encoding-specific output.
+     *
+     * @param props The properties containing the mapping of operators to PDFOperator
+     * classes.
+     *
+     * @throws IOException If there is an error reading the properties.
+     */
+    public PDFMarkedContentExtractor( Properties props ) throws IOException
+    {
+        super( props );
+        this.outputEncoding = null;
+        this.normalize = new TextNormalize(this.outputEncoding);
+    }
+
+    /**
+     * Instantiate a new PDFMarkedContentExtractor object. This object will load properties from
+     * Resources/PDFMarkedContentExtractor.properties and will use the given
+     * encoding for the text normalizer.
+     *
+     * @param encoding The encoding that the output will be written in.
+     *
+     * @throws IOException If there is an error reading the properties.
+     */
+    public PDFMarkedContentExtractor( String encoding ) throws IOException
+    {
+        super( ResourceLoader.loadProperties( "Resources/PDFMarkedContentExtractor.properties", true ));
+        this.outputEncoding = encoding;
+        this.normalize = new TextNormalize(this.outputEncoding);
+    }
+
+
+    /**
+     * This will determine if two floating point numbers are within a specified variance.
+     *
+     * @param first The first number to compare to.
+     * @param second The second number to compare to.
+     * @param variance The allowed variance.
+     * @return true if second lies strictly between first - variance and first + variance.
+     */
+    private boolean within( float first, float second, float variance )
+    {
+        return second > first - variance && second < first + variance;
+    }
+
+
+    /**
+     * Begins a new marked-content sequence.  If no sequence is currently open the new
+     * one is added to the list of top-level sequences, otherwise it is added as a child
+     * of the innermost open sequence.  The new sequence then becomes the innermost one.
+     *
+     * @param tag The tag of the sequence (the operator processors pass null if no name operand was found).
+     * @param properties The property list of the sequence, or null if there is none.
+     */
+    public void beginMarkedContentSequence(COSName tag, COSDictionary properties)
+    {
+        PDMarkedContent markedContent = new PDMarkedContent(tag, properties);
+        if (this.currentMarkedContents.isEmpty())
+        {
+            this.markedContents.add(markedContent);
+        }
+        else
+        {
+            // only non-null objects are ever pushed, so peek() cannot return null here
+            this.currentMarkedContents.peek().addMarkedContent(markedContent);
+        }
+        this.currentMarkedContents.push(markedContent);
+    }
+
+    /**
+     * Ends the innermost open marked-content sequence.  An end without a matching
+     * begin is silently ignored.
+     */
+    public void endMarkedContentSequence()
+    {
+        if (!this.currentMarkedContents.isEmpty())
+        {
+            this.currentMarkedContents.pop();
+        }
+    }
+
+    /**
+     * Adds an invoked XObject to the innermost open marked-content sequence.
+     * XObjects invoked outside of any marked-content sequence are ignored.
+     *
+     * @param xobject The XObject that was invoked.
+     */
+    public void xobject(PDXObject xobject)
+    {
+        if (!this.currentMarkedContents.isEmpty())
+        {
+            this.currentMarkedContents.peek().addXObject(xobject);
+        }
+    }
+
+
+    /**
+     * This will process a TextPosition object and add the
+     * text to the innermost open marked-content sequence.  It takes care of
+     * overlapping text: if duplicate suppression is enabled, a text that is
+     * rendered (nearly) on top of an identical, already accepted text is dropped.
+     * Text outside of any marked-content sequence is not stored.
+     * <p>
+     * Note that diacritics are not merged with their base character here; every
+     * accepted TextPosition is passed on unchanged.
+     *
+     * @param text The text to process.
+     */
+    protected void processTextPosition( TextPosition text )
+    {
+        if( this.suppressDuplicateOverlappingText && isOverlappingDuplicate( text ) )
+        {
+            return;
+        }
+        if( !this.currentMarkedContents.isEmpty() )
+        {
+            this.currentMarkedContents.peek().addText( text );
+        }
+    }
+
+    /**
+     * Checks whether the given text overlaps an already accepted text with the same
+     * character string.  If it does not, the text is recorded as accepted so that
+     * later texts are compared against it.
+     *
+     * @param text The text to check.
+     * @return true if the text duplicates an accepted text and should be suppressed.
+     */
+    private boolean isOverlappingDuplicate( TextPosition text )
+    {
+        String textCharacter = text.getCharacter();
+        List<TextPosition> sameTextCharacters = this.characterListMapping.get( textCharacter );
+        if( sameTextCharacters == null )
+        {
+            sameTextCharacters = new ArrayList<TextPosition>();
+            this.characterListMapping.put( textCharacter, sameTextCharacters );
+        }
+
+        // A text without a character string can not be compared; it is never treated
+        // as a duplicate (and must not be dereferenced to compute the tolerance).
+        if( textCharacter != null )
+        {
+            float textX = text.getX();
+            float textY = text.getY();
+            // Allow for kerning: two positions count as the same place if they differ
+            // by less than a third of the average width of one character of the text.
+            float tolerance = (text.getWidth()/textCharacter.length())/3.0f;
+            for( TextPosition character : sameTextCharacters )
+            {
+                if( character.getCharacter() != null &&
+                        within( character.getX(), textX, tolerance ) &&
+                        within( character.getY(), textY, tolerance ) )
+                {
+                    return true;
+                }
+            }
+        }
+        sameTextCharacters.add( text );
+        return false;
+    }
+
+
+    /**
+     * Returns the top-level marked-content sequences collected so far.
+     *
+     * @return The list of top-level marked contents (the internal list, not a copy).
+     */
+    public List<PDMarkedContent> getMarkedContents()
+    {
+        return this.markedContents;
+    }
+
+}

Added: pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/BeginMarkedContentSequence.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/BeginMarkedContentSequence.java?rev=921062&view=auto
==============================================================================
--- pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/BeginMarkedContentSequence.java (added)
+++ pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/BeginMarkedContentSequence.java Tue Mar  9 19:18:33 2010
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.util.operator;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.util.PDFMarkedContentExtractor;
+import org.apache.pdfbox.util.PDFOperator;
+/**
+ * Operator processor for BMC, which begins a marked-content sequence.
+ * @author koch
+ * @version $Revision$
+ *
+ */
+public class BeginMarkedContentSequence extends OperatorProcessor
+{
+
+    /**
+     * {@inheritDoc}
+     */
+    @Override
+    public void process(PDFOperator operator, List<COSBase> arguments)
+        throws IOException
+    {
+        // The last name among the operands is used as the tag; null if there is none.
+        COSName tag = null;
+        for (int index = 0; index < arguments.size(); index++)
+        {
+            COSBase operand = arguments.get(index);
+            if (operand instanceof COSName)
+            {
+                tag = (COSName) operand;
+            }
+        }
+        // Only the marked content extractor is notified; other engines ignore BMC.
+        if (!(this.context instanceof PDFMarkedContentExtractor))
+        {
+            return;
+        }
+        PDFMarkedContentExtractor extractor = (PDFMarkedContentExtractor) this.context;
+        // BMC carries no property list
+        extractor.beginMarkedContentSequence(tag, null);
+    }
+
+}

Added: pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/BeginMarkedContentSequenceWithProperties.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/BeginMarkedContentSequenceWithProperties.java?rev=921062&view=auto
==============================================================================
--- pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/BeginMarkedContentSequenceWithProperties.java (added)
+++ pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/BeginMarkedContentSequenceWithProperties.java Tue Mar  9 19:18:33 2010
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.util.operator;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.util.PDFMarkedContentExtractor;
+import org.apache.pdfbox.util.PDFOperator;
+/**
+ * BDC : Begins a marked-content sequence with property list.
+ *
+ * @author koch
+ * @version $Revision$
+ */
+public class BeginMarkedContentSequenceWithProperties extends OperatorProcessor
+{
+
+    /**
+     * {@inheritDoc}
+     */
+    @Override
+    public void process(PDFOperator operator, List<COSBase> arguments)
+        throws IOException
+    {
+        // The last name operand becomes the tag and the last dictionary operand the
+        // property list; either stays null if no such operand is present.
+        COSName tag = null;
+        COSDictionary properties = null;
+        for (COSBase argument : arguments)
+        {
+            if (argument instanceof COSName)
+            {
+                tag = (COSName) argument;
+            }
+            else if (argument instanceof COSDictionary)
+            {
+                properties = (COSDictionary) argument;
+            }
+        }
+        // Only the marked content extractor is notified; other engines ignore BDC.
+        if (this.context instanceof PDFMarkedContentExtractor)
+        {
+            ((PDFMarkedContentExtractor) this.context).beginMarkedContentSequence(tag, properties);
+        }
+    }
+
+}

Added: pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/EndMarkedContentSequence.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/EndMarkedContentSequence.java?rev=921062&view=auto
==============================================================================
--- pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/EndMarkedContentSequence.java (added)
+++ pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/EndMarkedContentSequence.java Tue Mar  9 19:18:33 2010
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.util.operator;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.util.PDFMarkedContentExtractor;
+import org.apache.pdfbox.util.PDFOperator;
+
+/**
+ * EMC : Ends a marked-content sequence begun by BMC or BDC.
+ * @author koch
+ * @version $Revision: $
+ */
+public class EndMarkedContentSequence extends OperatorProcessor
+{
+
+    /**
+     * {@inheritDoc}
+     */
+    @Override
+    public void process(PDFOperator operator, List<COSBase> arguments)
+        throws IOException
+    {
+        // EMC takes no operands; only the marked content extractor is notified.
+        if (this.context instanceof PDFMarkedContentExtractor)
+        {
+            ((PDFMarkedContentExtractor) this.context).endMarkedContentSequence();
+        }
+    }
+
+}

Modified: pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/Invoke.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/Invoke.java?rev=921062&r1=921061&r2=921062&view=diff
==============================================================================
--- pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/Invoke.java (original)
+++ pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/Invoke.java Tue Mar  9 19:18:33 2010
@@ -23,6 +23,7 @@ import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.PDResources;
 import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObject;
 import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectForm;
+import org.apache.pdfbox.util.PDFMarkedContentExtractor;
 import org.apache.pdfbox.util.PDFOperator;
 
 import java.io.IOException;
@@ -54,6 +55,10 @@ public class Invoke extends OperatorProc
 
         Map xobjects = context.getXObjects();
         PDXObject xobject = (PDXObject) xobjects.get(name.getName());
+        if (this.context instanceof PDFMarkedContentExtractor)
+        {
+            ((PDFMarkedContentExtractor) this.context).xobject(xobject);
+        }
 
         if(xobject instanceof PDXObjectForm)
         {