Page slicing in Java

This technique gives a possible implementation for the CategoryBpPageSizeLimit and CategoryBpPageSizeUsable using JSR-188 and SAX (Simple API for XML) Java APIs.

The idea is to use an XML document analyzer based on events (like SAX) to process the elements taking into account that the number of bytes already processed in the current slice of a larger page cannot exceed a maximum.

SAX has been chosen over other options (such as DOM) in order to reduce memory use while the document is being analysed.

If the resource to be split were a WML document, JSR-188 could be used to obtain the maximum deck size and split the resource into units weighing at most "decksize". Unfortunately, it is not easy to determine the maximum page weight from CC/PP or UAProf for non-WML resources (XHTML-based documents and their variants).

When this code is tried on documents such as MWBP 1.0, which contain anchors referencing other parts of the same document, the anchor targets may not exist in the resulting slice, so those links break. A more refined version of the sample code should be created to achieve this.

Pre-requisites:

A servlet container
JSR-188 API
SAX API

Implementation Steps:

Have a servlet container running
JSR-188 and SAX API installed and available for the application.
Create an application implemented by a servlet (SplitterExample.java) and an auxiliary class (MySplitter.java)

Code sample 1 part 1 (SplitterExample.java)

package org.splitter;
 
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.HashMap;
 
import javax.servlet.ServletConfig;
import javax.servlet.ServletContext;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
 
import ctic.ProfileParser;
 
 
/**
 * Servlet implementation class for Servlet: SplitterExample.
 * <p>
 * Demonstrates the class "MySplitter" with the document "Mobile Web Best
 * Practices 1.0 - W3C Working Draft 13 January 2006": on every request the
 * resource named by {@code BASE_NAME} is split into several smaller pages,
 * which are written next to it as {@code part_<name>_<n>.html}, and the client
 * is forwarded to the first of them. When no client profile is available, or
 * when nothing was split, the client is forwarded to the original resource.
 */
public class SplitterExample extends javax.servlet.http.HttpServlet implements
    javax.servlet.Servlet
{
  /**
   * Size limit, in bytes, applied when the source resource is not WML.
   */
  private static final long DEFAULT_MAX_SIZE = 4096;
  /**
   * Switch to {@code true} when the source resource is a WML document; the
   * limit is then taken from the client's "WmlDeckSize" attribute (when it is
   * present and valid) instead of {@code DEFAULT_MAX_SIZE}.
   */
  private static final boolean SOURCE_IS_WML = false;

  /**
   * Internal field in charge of getting client's capabilities.
   */
  private ProfileParser profileParser;
  /**
   * Internal field to store the servlet context, used to resolve real paths,
   * to forward requests and to log failures.
   */
  private ServletContext context;
  /**
   * Internal field to store the maximum size desired for the parts of the page.
   */
  private long maxSize;

  /**
   * Constant that indicates the name of the resource that will be split.
   */
  private final String BASE_NAME = "page.html";



  public SplitterExample()
  {
    super();
  }


  /**
   * Initializes the servlet: creates the profile parser and keeps a reference
   * to the servlet context.
   *
   * @param  cfg  the configuration supplied by the container.
   */
  public void init(ServletConfig cfg) throws ServletException
  {
    // Let the base class record the configuration so that the inherited
    // getServletConfig()/getServletContext() keep working.
    super.init(cfg);

    profileParser = new ProfileParser();
    context = cfg.getServletContext();
  }


  protected void doGet(HttpServletRequest request, HttpServletResponse response)
      throws ServletException, IOException
  {
    doPost(request,response);
  }


  protected void doPost(HttpServletRequest request, HttpServletResponse response)
      throws ServletException, IOException
  {
    processRequest(request,response);
  }

  /**
   * Processes the client's request by forwarding it to the right resource:
   * the first split page when the client profile is known and the resource
   * was actually split, the original resource otherwise. Any failure is
   * logged and reported to the client as HTTP 500 when the response has not
   * been committed yet.
   *
   * @param  request    object that encapsulates the client's request.
   * @param  response   object that encapsulates the server's response.
   */
  void processRequest(HttpServletRequest request, HttpServletResponse response)
  {
    try
    {
      // By default serve the original, unsplit resource.
      String ref = "/" + BASE_NAME;

      // Get device attributes hash map
      HashMap attributes = profileParser.getProfileAttr(request);

      // If information about the client is available ...
      if (attributes != null)
      {
        // Maximum deck size for WAP; -1 when missing or not a number.
        long deckSize = parseDeckSize((String) attributes.get("WmlDeckSize"));

        // The sample resource is an XHTML document, so a fixed limit is used
        // unless SOURCE_IS_WML is switched on.
        if (SOURCE_IS_WML && deckSize > 0)
          maxSize = deckSize;
        else
          maxSize = DEFAULT_MAX_SIZE;

        // Split it! Only forward to the first part if one was really written.
        if (splitResource(maxSize) > 0)
          ref = "/" + partName(0);
      }

      context.getRequestDispatcher(ref).forward(request,response);
    }
    catch (Exception e)
    {
      context.log("SplitterExample: unable to serve " + BASE_NAME, e);
      reportFailure(response);
    }
  }

  /**
   * Splits the resource using the limit currently stored in {@code maxSize}.
   * Failures are logged; nothing is reported to the caller.
   */
  void splitFile()
  {
    splitResource(maxSize);
  }

  /**
   * Splits the resource into pages of roughly {@code limit} bytes and writes
   * every page, UTF-8 encoded, next to the original resource.
   *
   * @param  limit  the maximum desired size of a page.
   * @return        the number of pages written, or 0 when the resource is
   *                missing, cannot be resolved to a file, does not exceed the
   *                limit, or the split failed (the failure is logged).
   */
  private int splitResource(long limit)
  {
    try
    {
      // getRealPath returns null when the resource is not on the file system.
      String sourcePath = context.getRealPath("/" + BASE_NAME);
      if (sourcePath == null)
        return 0;

      File source = new File(sourcePath);

      // Nothing to do unless the resource exists and exceeds the limit.
      if (!source.isFile() || source.length() <= limit)
        return 0;

      // SAX initialization.
      SAXParserFactory factory = SAXParserFactory.newInstance();
      SAXParser saxParser = factory.newSAXParser();

      // Create and configure "MySplitter" object.
      MySplitter splitter = new MySplitter(BASE_NAME);
      splitter.headerTag = "h2";
      splitter.maxFileSize = limit;

      // Parse from the File itself: the String overload expects a URI, not a
      // file-system path.
      saxParser.parse(source,splitter);

      // Save each part in a different file, beside the original resource.
      File directory = source.getParentFile();
      int count = splitter.getNumParts();
      for (int i = 0; i < count; i++)
      {
        FileOutputStream out = new FileOutputStream(new File(directory,partName(i)));
        try
        {
          // The pages declare encoding="UTF-8", so write them as UTF-8.
          out.write(splitter.getText(i).toString().getBytes("UTF-8"));
        }
        finally
        {
          out.close();
        }
      }
      return count;
    }
    catch (Exception e)
    {
      context.log("SplitterExample: unable to split " + BASE_NAME, e);
      return 0;
    }
  }

  /**
   * Builds the file name of a split page: "part_", the resource name without
   * its extension, "_", the page number and ".html".
   *
   * @param  index  the number of the split page.
   * @return        the file name of that page (no directory, no leading "/").
   */
  private String partName(int index)
  {
    int dot = BASE_NAME.lastIndexOf('.');
    String stem = (dot < 0) ? BASE_NAME : BASE_NAME.substring(0,dot);
    return "part_" + stem + "_" + index + ".html";
  }

  /**
   * Parses the "WmlDeckSize" profile attribute without ever failing.
   *
   * @param  value  the raw attribute value, may be null.
   * @return        the parsed size, or -1 when absent or not a number.
   */
  private long parseDeckSize(String value)
  {
    if (value == null)
      return -1;
    try
    {
      return Long.parseLong(value.trim());
    }
    catch (NumberFormatException e)
    {
      return -1;
    }
  }

  /**
   * Sends an HTTP 500 to the client unless the response is already committed.
   *
   * @param  response  object that encapsulates the server's response.
   */
  private void reportFailure(HttpServletResponse response)
  {
    if (response.isCommitted())
      return;
    try
    {
      response.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
    }
    catch (IOException e)
    {
      context.log("SplitterExample: unable to report the failure", e);
    }
  }

}

Code sample 1 part 2 (MySplitter.java)

package org.splitter;
 
import java.io.IOException;
import java.util.ListIterator;
import java.util.Stack;
import java.util.Vector;
 
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.DefaultHandler;
 
 
/**
 * SAX handler (extends DefaultHandler) that re-serializes an XHTML document
 * and splits it into several pages when it grows beyond a maximum size.
 * <p>
 * Every page is closed with the end tags of the elements that are open at the
 * split point. The next page starts with the "common header" (everything from
 * the beginning of the page being built up to the first end of
 * {@link #headerTag}), followed by the start tags of the elements that are
 * still open. Once the document ends, navigation links between the pages are
 * inserted right after the start tag of "body" in every page that has one.
 * <p>
 * The size is measured in characters of generated markup
 * ({@code StringBuffer.length()}), not in encoded bytes.
 * <p>
 * TODO: Regenerate the links associated with anchors; because of the split,
 * an anchor may point to content that ended up in another page.
 */
public class MySplitter extends DefaultHandler
{
  /**
   * XML declaration and XHTML Basic DOCTYPE written at the top of a page.
   */
  private static final String PROLOG =
      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
      + "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML Basic 1.0//EN\" "
      + "\"http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd\">";
  /**
   * Navigation offset of a page that has no "body" start tag.
   */
  private static final int NO_NAVIGATION = -1;

  /**
   * An element that is currently open: its name and the exact start tag that
   * was written for it (needed to re-open it in the next page).
   */
  private static final class OpenTag
  {
    final String qName;
    final String markup;

    OpenTag(String aQName, String aMarkup)
    {
      qName = aQName;
      markup = aMarkup;
    }
  }

  /**
   * A finished page: its text and the offset where navigation links go.
   */
  private static final class Slice
  {
    final StringBuffer text;
    final int navOffset;

    Slice(StringBuffer aText, int aNavOffset)
    {
      text = aText;
      navOffset = aNavOffset;
    }
  }

  /**
   * Internal field to store all finished pages (Slice objects).
   */
  private Vector vPart;
  /**
   * Offset into the current page where the navigation links must be inserted,
   * or NO_NAVIGATION while the current page has no "body" start tag.
   */
  private int navPosition;
  /**
   * Internal field to store the common header for all following pages.
   */
  private StringBuffer header;
  /**
   * Elements (OpenTag objects) that were open when the header was captured,
   * outermost first. Their start tags are part of the header text.
   */
  private Object[] headerStack;
  /**
   * Value of navPosition at the moment the header was captured.
   */
  private int headerNavPosition;
  /**
   * Internal field to store the page currently being built.
   */
  private StringBuffer part;
  /**
   * Stack of the currently open elements (OpenTag objects), necessary to know
   * which tags must be closed when a page ends and re-opened when the next
   * one starts.
   */
  private Stack tagStack;
  /**
   * Base name (without extension) of the resource being split; used to build
   * the navigation links.
   */
  private String baseName;


  /**
   * Public field to specify the maximum size for each page. The value should
   * be lower than the size really desired: the check is performed only after
   * the end of an element, and the closing tags and navigation links are
   * added afterwards.
   */
  public long maxFileSize = 4096;
  /**
   * Public field to specify the tag that delimits the common header. The
   * common header comprises all the text between the beginning and the first
   * end of the headerTag ("h2" by default).
   */
  public String headerTag = "h2";


  /**
   * Class constructor specifying the name of the resource to split.
   *
   * @param  aBaseName  the file name of the resource; the part after the last
   *                    '.' is dropped, a name without '.' is used as it is.
   */
  public MySplitter(String aBaseName)
  {
    vPart = new Vector();

    part = new StringBuffer();
    header = new StringBuffer();
    headerStack = new Object[0];

    navPosition = NO_NAVIGATION;
    headerNavPosition = NO_NAVIGATION;
    tagStack = new Stack();

    // Extract only the file name without extension (if there is one).
    int dot = aBaseName.lastIndexOf('.');
    baseName = (dot < 0) ? aBaseName : aBaseName.substring(0,dot);
  }


  /**
   * Starts the first page with the XML declaration and the XHTML Basic
   * DOCTYPE.
   */
  public void startDocument()
  {
    part.append(PROLOG);
  }


  /**
   * Stores the last page and inserts the navigation links in all pages.
   */
  public void endDocument()
  {
    vPart.add(new Slice(part,navPosition));
    generateNavigation();
  }


  /**
   * Writes the start tag of the element, with all its attributes, to the
   * current page and records the element as open.
   */
  public void startElement(String namespaceURI, String lName, String qName, Attributes attrs) throws SAXException
  {
    // Forms the tag with the qualified name and all attributes ...
    StringBuffer buffer = new StringBuffer();
    buffer.append('<').append(qName);

    for (int i = 0; i < attrs.getLength(); i++)
    {
      buffer.append(' ').append(attrs.getQName(i)).append("=\"");
      // The parser hands over the decoded value: escape it again.
      String value = attrs.getValue(i);
      appendEscaped(buffer,value.toCharArray(),0,value.length(),true);
      buffer.append('"');
    }

    // Elements that cannot contain data are written as empty-element tags.
    if (isEmptyElement(qName))
      buffer.append('/');

    buffer.append('>');

    String markup = buffer.toString();
    part.append(markup);
    tagStack.push(new OpenTag(qName,markup));

    // Navigation links go right after the start tag of body.
    if ("body".equals(qName))
      navPosition = part.length();
  }


  /**
   * Writes the end tag of the element, captures the common header at the
   * first end of the headerTag, and then checks the size of the page.
   */
  public void endElement(String namespaceURI, String sName, String qName)
      throws SAXException
  {
    // Removes current tag, due to end element.
    tagStack.pop();

    // Empty-element tags were already closed by startElement. Every other
    // element, tbody included, gets the end tag matching its start tag.
    if (!isEmptyElement(qName))
      part.append("</").append(qName).append('>');

    // The common header comprises all the text between the beginning and the
    // first end of the headerTag ("h2" by default).
    if ((header.length() == 0) && qName.equals(headerTag))
    {
      header.append(part.toString());
      // Remember which elements the header leaves open, and where the
      // navigation links go inside it.
      headerStack = tagStack.toArray();
      headerNavPosition = navPosition;
    }

    // Check current size after each end of tag.
    checkPageSize();
  }


  /**
   * Stores the text content in the current page, escaping the characters
   * that are markup-significant (the parser delivers them already decoded).
   */
  public void characters(char[] buf, int offset, int len)
  {
    appendEscaped(part,buf,offset,len,false);
  }


  public void error(SAXParseException e) throws SAXParseException
  {
    throw e;
  }


  /**
   * Checks whether the current page exceeds the maximum size; in that case
   * it completes the page by closing the open elements and starts a new page
   * with the stored header and the start tags of the elements still open.
   * Nothing happens while no element is open, because then the document
   * element has just ended and no more content can follow.
   */
  public void checkPageSize()
  {
    if (part.length() <= maxFileSize || tagStack.isEmpty())
      return;

    // Close ALL open elements, innermost first, and store the page.
    for (int i = tagStack.size() - 1; i >= 0; i--)
      part.append("</").append(((OpenTag) tagStack.elementAt(i)).qName).append('>');
    vPart.add(new Slice(part,navPosition));

    // The new page starts with the common header, or with the bare prolog
    // when no header has been captured yet.
    StringBuffer next = new StringBuffer();
    int nextNav = NO_NAVIGATION;
    if (header.length() > 0)
    {
      next.append(header.toString());
      nextNav = headerNavPosition;
    }
    else
      next.append(PROLOG);

    // Elements left open by the header that are still open now (same object,
    // same depth) need nothing; they form the common prefix.
    int common = 0;
    while (common < headerStack.length && common < tagStack.size()
        && headerStack[common] == tagStack.elementAt(common))
      common++;

    // Elements the header left open but that have ended since are closed ...
    for (int i = headerStack.length - 1; i >= common; i--)
      next.append("</").append(((OpenTag) headerStack[i]).qName).append('>');

    // ... and the remaining open elements are re-opened.
    for (int i = common; i < tagStack.size(); i++)
    {
      OpenTag tag = (OpenTag) tagStack.elementAt(i);
      next.append(tag.markup);
      if ("body".equals(tag.qName))
        nextNav = next.length();
    }

    part = next;
    navPosition = nextNav;
  }

  /**
   * This method returns ALL the text of the page specified by index.
   *
   * @param  index    the number of the page.
   * @return          the text of the specified page.
   */
  public StringBuffer getText(int index) throws IOException
  {
    return ((Slice) vPart.elementAt(index)).text;
  }

  /**
   * This method returns the number of pages stored so far.
   */
  public int getNumParts()
  {
    return vPart.size();
  }

  /**
   * This method generates the navigation links of every page and inserts
   * them into the pages. Nothing is done when there is only one page. The
   * first page links to the last and next pages, the last page to the
   * previous and first pages, and every other page to all four.
   */
  void generateNavigation()
  {
    int count = vPart.size();
    if (count <= 1)
      return;

    int last = count - 1;
    String navHome = getLink(0,"[Home]");
    String navEnd = getLink(last,"[End]");

    for (int i = 0; i < count; i++)
    {
      StringBuffer nav = new StringBuffer();
      if (i > 0)
      {
        nav.append(getLink((i - 1),"[Prv.]"));
        nav.append(navHome);
      }
      if (i < last)
      {
        nav.append(navEnd);
        nav.append(getLink((i + 1),"[Next]"));
      }
      addNavigation(i,nav);
    }
  }

  /**
   * This method inserts the navigation text at the top of the body of the
   * page specified by the param i. A page without a "body" start tag is left
   * untouched.
   *
   * @param  i    the number of the page.
   * @param  nav  the text of the navigation links.
   */
  void addNavigation(int i, StringBuffer nav)
  {
    Slice slice = (Slice) vPart.elementAt(i);
    if (slice.navOffset == NO_NAVIGATION)
      return;
    slice.text.insert(slice.navOffset,nav.toString());
  }

  /**
   * This method builds the link that refers to the page specified by the
   * param i.
   *
   * @param  i          the number of the page.
   * @param  label      a descriptive text for the link.
   * @return            the text of the link in X(HTML).
   */
  String getLink(int i, String label)
  {
    return "<a href=\"part_" + baseName + "_" + i + ".html\">" + label + "</a>";
  }

  /**
   * Tells whether the element is written as an empty-element tag.
   *
   * @param  qName  the qualified name of the element.
   * @return        true for img, area, br, input and meta.
   */
  private static boolean isEmptyElement(String qName)
  {
    return "img".equals(qName) || "area".equals(qName) || "br".equals(qName)
        || "input".equals(qName) || "meta".equals(qName);
  }

  /**
   * Appends characters to a buffer, replacing the markup-significant ones by
   * their predefined entities.
   *
   * @param  out          the buffer to append to.
   * @param  buf          the characters.
   * @param  offset       the index of the first character to append.
   * @param  len          the number of characters to append.
   * @param  inAttribute  true to escape the double quote as well.
   */
  private static void appendEscaped(StringBuffer out, char[] buf, int offset, int len, boolean inAttribute)
  {
    for (int i = offset; i < offset + len; i++)
    {
      char c = buf[i];
      if (c == '&')
        out.append("&amp;");
      else if (c == '<')
        out.append("&lt;");
      else if (c == '>')
        out.append("&gt;");
      else if (c == '"' && inAttribute)
        out.append("&quot;");
      else
        out.append(c);
    }
  }

}

Back to BestPracticesList

CategoryJava CategoryXhtml CategoryBpPageSizeUsable