JavaScript - for removing redundancy in HTML



  • This is an interesting script I wrote for a university assignment a bit over a year ago for processing files on load.  This was to add links on a lot of the words to the next occurrence of the word, in that page or in the next (alphabetically).  Originally the links were simply in the html file, but somehow we decided to add them in at page-load time to avoid so much redundancy in the file.  This is because each word would be repeated twice within the tag, as well as the text, making the file about 4 times as big.  Of course, it made loading the page extremely slow.

    /*
     * Helper invoked from the generated links: points both frames at their
     * new targets once the references have been expanded.
     *   docpage - page to show in the document frame
     *   n       - occurrence number appended to the anchor name
     *   word    - linked word; used for the anchor and for the index page name
     */
    function change2(docpage, n, word)
    {
      var anchoredPage = docpage + "#" + word + n;
      var indexPage = word + ".html";

      // document frame first, then the circular index frame
      parent.documentframe.location = anchoredPage;
      parent.circularframe.location = indexPage;
    }

    /*
     * Decides whether a single character can delimit a word.
     *   letter - the character next to a candidate word (a string of length
     *            0 or 1); pass "" when there is no neighbouring character
     * Returns true when the character is whitespace or punctuation, or when
     * it is the empty string (the edge of the text); false otherwise.
     */
    function notPartOfWord(letter)
    {
      // The ";" and "'" entries were previously present only as the
      // URL-encoded residue "%3B%27", which wrongly made 3, B, 2 and 7
      // delimiters while leaving the two real characters out.
      var punctuation = ' \t\n\r,.?-=_+()!@#$%^&*~`<>[]{};\':/"';

      // No neighbour at all means the word touches the edge of the text.
      if (letter === "") {
        return true;
      }
      // The length guard keeps multi-character input from matching a run of
      // delimiters, which indexOf alone would accept.
      return typeof letter === "string" && letter.length === 1 && punctuation.indexOf(letter) >= 0;
    }

    /*
     * Searches lcBodyText for the next linkable occurrence of lcWord at or
     * after position start.  Both strings are assumed to be lowercase already.
     *
     * An occurrence is linkable when it is
     *   - outside any HTML tag,
     *   - outside any <script> block,
     *   - outside any <a> element, and
     *   - delimited on both sides by a non-word character or the edge of the text.
     *
     * Returns the index of the occurrence within lcBodyText, or -1 if none.
     *
     * The search runs on the whole text with absolute indices instead of
     * chopping it, so the context checks can still see a "<", "<script" or
     * "<a" that lies before the current search position.
     */
    function findNextOccurrence(lcBodyText, lcWord, start)
    {
      // An empty word "matches" everywhere and would never advance the search.
      if (lcWord.length === 0) {
        return -1;
      }

      var pos = lcBodyText.indexOf(lcWord, start);
      while (pos >= 0) {
        var end = pos + lcWord.length;

        // Each test compares the nearest closer and opener before pos: the
        // word is outside the construct when the closer is the more recent.
        var outsideTag = lcBodyText.lastIndexOf(">", pos) >= lcBodyText.lastIndexOf("<", pos);
        var outsideScript = lcBodyText.lastIndexOf("/script>", pos) >= lcBodyText.lastIndexOf("<script", pos);
        // possible problem: an <a> tag without a href is not a link, and "<a"
        // also matches other tag names that start with "a"
        var outsideLink = lcBodyText.lastIndexOf("/a>", pos) >= lcBodyText.lastIndexOf("<a", pos);

        if (outsideTag && outsideScript && outsideLink) {
          // The edges of the text count as word boundaries.
          var clearBefore = pos === 0 || notPartOfWord(lcBodyText.charAt(pos - 1));
          var clearAfter = end >= lcBodyText.length || notPartOfWord(lcBodyText.charAt(end));
          if (clearBefore && clearAfter) {
            return pos; // a suitable occurrence was found, no need to look further
          }
        }

        pos = lcBodyText.indexOf(lcWord, end);
      }
      return -1; // ran out of text without finding a suitable occurrence
    }

    /*
     * Wraps every occurrence of word that findNextOccurrence reports in
     * bodyText with an <a> tag and returns the resulting text.
     *
     * Each link is named word + occurrence number and calls change2() from a
     * javascript: href.  Every occurrence targets the following occurrence on
     * thisPage, except the last one, which targets occurrence 1 on nextPage.
     * The matched text keeps its original letter case.
     *
     * A plain regular-expression replace is not used because matches inside
     * HTML tags, script blocks and existing links have to be filtered out;
     * that filtering is delegated to findNextOccurrence.
     */
    function doLinkWord(bodyText, word, thisPage, nextPage)
    {
      var closeTag = "</a>";
      var needle = word.toLowerCase();
      var remaining = bodyText;               // text not yet copied to the output
      var lowered = remaining.toLowerCase();  // lowercase twin used for searching
      var pieces = [];                        // output fragments, joined at the end
      var seen = 0;                           // occurrences linked so far
      var hit = findNextOccurrence(lowered, needle, 0);

      while (remaining.length > 0) {
        if (hit < 0) {
          // nothing left to link: copy the rest through unchanged
          pieces.push(remaining);
          break;
        }

        seen += 1;
        var afterHit = hit + word.length;

        // look ahead so this link knows whether it is the last one on the page
        var following = findNextOccurrence(lowered, needle, afterHit);
        var target = (following < 0)
          ? nextPage + "', '1'"
          : thisPage + "', '" + (seen + 1) + "'";
        var openTag = '<a name="' + word + seen + '" href="javascript:change2(\'' + target + ", '" + word + '\')">';

        pieces.push(remaining.substring(0, hit), openTag, remaining.substr(hit, word.length), closeTag);

        // drop what was just emitted and re-base the look-ahead index on the rest
        remaining = remaining.substr(afterHit);
        lowered = remaining.toLowerCase();
        hit = following - afterHit;
      }

      return pieces.join("");
    }


    /*
     * Rewrites the body's HTML so that every listed word becomes a link.
     *   wordFilePairs - array of [word, nextPage] pairs, processed in order
     *   thisPage      - name of the page being processed
     * The body HTML is read once, passed through doLinkWord for each pair,
     * and written back once at the end.
     */
    function addLinks(wordFilePairs, thisPage)
    {
      var html = document.body.innerHTML;
      var pairIndex = 0;
      while (pairIndex < wordFilePairs.length) {
        var pair = wordFilePairs[pairIndex];
        html = doLinkWord(html, pair[0], thisPage, pair[1]);
        pairIndex += 1;
      }
      document.body.innerHTML = html;
    }


    /*
     * Writes one link per word occurrence into the document being loaded.
     *   docDataList - array of [page, occurrenceCount] entries
     *   word        - the word the links refer to
     * For each entry, links numbered 1..occurrenceCount are emitted with
     * document.write; each one calls change2(page, n, word).
     */
    function generateDocList(docDataList, word)
    {
      for (var i = 0; i < docDataList.length; i++) {
        // declared locally: it used to leak into the global scope
        var entry = docDataList[i];
        for (var n = 1; n <= entry[1]; n++) {
          document.write('<a name="' + entry[0] + n + '" href="javascript:change2(\'' + entry[0] + "', '" + (n) + "', '" + word + '\')">' + entry[0] + ' ' + n + '</a>');
        }
      }
    }

     

    After one of the other coders in my group pointed out that I was scanning through the entire document n times (where n was the number of different words in the document, pretty much), I replaced the main link-adding system with one that instead navigated the DOM tree and added the correct nodes where appropriate, and made use of dictionaries so that I only had to scan once.  It was actually quicker.

    By the way, addLinks was called as follows (to take a small mockup example, as the real stuff was waaay too long with a huge list of words (rather than just two like here) generated by a Java program, which also converted the documents to xhtml from Word Docs (and resulting in a file that was half not-really used CSS junk and half html tags all on one line)):

    <body dir="ltr"onload="javascript:addLinks([['links', 'usertraining.html'], ['quality', 'usertraining.html']],'userreference.html');parent.documentframe.location=parent.documentframe.location;"

    the parent.documentframe.location=itself hack was something important to stop infinite recursion or something... btw, while doing this assignment I was simultaneously learning html and javascript for the first time in another unit.

    Hopefully I'll find the DOM manipulation version some time.  It didn't seem to make it onto my computer or into version control, even though I'm sure it was in the final demo and submission... there are however 6 different versions of this one in various folders.

    Oh, and it didn't work in IE at all.  That took a while to get across to those in our group who weren't coding focussed.  Interestingly, some of them went on to create a game framework as a project, which I've ended up using in a game programming unit and simultaneously extending/cleaningup as a project for another unit... sigh.



  • Just a caution:  String.substr is junk and should never be used.  If you want to pull a single character out of a string, use String.charAt().

     When searching for a character in a string, it's also good to use String.indexOf() rather than iterating the characters yourself.



  • @mrprogguy said:

    Just a caution:  String.substr is junk and should never be used.  If you want to pull a single character out of a string, use String.charAt().

    What do you suggest using if you want to pull a longer substring out of a string? Multiple String.charAt()s and concatenate the results? Regular expressions?



  • I was often searching for a word, not a character, and used indexOf() and lastIndexOf() quite a lot.  Not that I want to justify anything... the whole exercise was rather pointless.

    I remembered that I made a second checkout of the repository cos I'd somehow messed up the first one.  However the older one has been updated more recently, so I thought they were the other way around (despite the newer one having a 2 in the folder name).
    Anyhow, I found another 6 copies of my script, and at least one is the DOM version...

     

    /*
     * Helper invoked from the generated links: switches both frames together
     * once the references have been expanded.
     *   docpage - page to show in the document frame
     *   n       - occurrence number; a value loosely equal to 0 means
     *             "no particular occurrence"
     *   word    - linked word; used for the anchor and for the index page name
     */
    function change2(docpage, n, word)
    {
      // loose comparison on purpose: n arrives as a string from the href
      if (n != 0) {
        parent.documentframe.location = docpage + "#" + word + n;
        parent.circularframe.location = word + ".html#" + docpage + n;
        return;
      }

      // no occurrence: show the page without an anchor name and blank the index frame
      parent.documentframe.location = docpage + "#";
      parent.circularframe.location = 'ablankpage.html';
    }


    /*
     * Entry point of the DOM version: links every mapped word in the document.
     *   wordMapping  - dictionary passed through to dft and fixLastLinks
     *   thisPageName - name of the page being processed
     * First walks the tree from the document's root element, then re-targets
     * the links recorded as last for each word.
     */
    function addLinks(wordMapping, thisPageName)
    {
      var root = document.documentElement;
      dft(root, wordMapping, thisPageName);
      fixLastLinks(wordMapping);
    }


    /*
     * Depth-first traversal of the DOM below node.
     *   node         - node to visit
     *   wordMapping  - dictionary passed through to splitNode
     *   thisPageName - name of the page being processed
     * Text nodes are handed to splitNode.  Elements are descended into,
     * except <a> (already a link) and <script> / <style>, whose text is code
     * rather than prose and must not be wrapped in links.
     */
    function dft(node, wordMapping, thisPageName)
    {
      if (node.nodeType === Node.TEXT_NODE)
      {
        return splitNode(node, wordMapping, thisPageName);
      }
      if (node.nodeType !== Node.ELEMENT_NODE || !node.hasChildNodes())
      {
        return;
      }

      var tag = node.tagName.toLowerCase();
      if (tag === 'a' || tag === 'script' || tag === 'style')
      {
        return; // ignore existing links and non-prose content
      }

      // length is re-read on every pass: splitNode inserts new siblings, and
      // the remainder of a split text node is picked up by a later iteration
      var children = node.childNodes;
      for (var i = 0; i < children.length; i++)
      {
        dft(children[i], wordMapping, thisPageName);
      }
    }

    /*
     * Scans a text node for the first word that has an entry in wordMapping
     * and wraps that word in a link.
     *   node         - the text node to scan
     *   wordMapping  - dictionary keyed by lowercase word; entry[1] holds the
     *                  running occurrence count and entry[2] the most recent
     *                  link node, both updated here
     *   thisPageName - page name placed in the link's change2() call
     *
     * A word is a run of the letters a-z / A-Z.  On the first mapped word the
     * text node is split into prefix, word and remainder, the word is moved
     * inside a new <a> element, and the function returns; the remainder is a
     * new sibling text node that this call does not scan.
     */
    function splitNode(node, wordMapping, thisPageName)
    {
      var text = node.nodeValue;
      var word = "";
      // Run one step past the end so a word that ends the text is handled
      // too; charAt() yields "" there, which counts as a non-letter.
      for (var i = 0; i <= text.length; i++)
      {
        var letter = text.charAt(i);
        var isLetter = (letter >= 'a' && letter <= 'z') || (letter >= 'A' && letter <= 'Z');
        if (isLetter)
        {
          word += letter;
          continue;
        }
        if (word.length === 0)
        {
          continue; // consecutive non-letters, nothing collected yet
        }

        // we have come to a non-letter that is after some letters
        var key = word.toLowerCase();
        // own properties only, so words such as "constructor" do not pick up
        // members inherited from Object.prototype
        var data = Object.prototype.hasOwnProperty.call(wordMapping, key) ? wordMapping[key] : null;
        if (!data)
        {
          word = ""; // not an index word, so skip it
          continue;
        }

        var wordCount = (data[1] || 0) + 1;
        data[1] = wordCount;

        var linkNode = document.createElement('a');
        linkNode.name = key + wordCount;
        linkNode.href = "javascript:change2('" + thisPageName + "', '" + (wordCount + 1) + "', '" + key + "')";

        // prefix | word | remainder
        node.splitText(i - word.length);
        var wordNode = node.nextSibling;
        wordNode.splitText(word.length);
        node.parentNode.replaceChild(linkNode, wordNode);
        linkNode.appendChild(wordNode);

        // after all processing the last linkNode for the word will be here,
        // so it can be changed to go to the next document
        data[2] = linkNode;
        return; // the node has been shrunk to a size less than i
      }
    }

    /*
     * Re-targets the last link created for each word.
     *   wordMapping - dictionary keyed by word; entry[0] is used as the page
     *                 argument of change2() and entry[2], when set, is the
     *                 link node to update
     * The updated link calls change2(entry[0], '1', word), i.e. occurrence 1
     * on the page named by entry[0].  Words without a recorded link are
     * left alone.
     */
    function fixLastLinks(wordMapping)
    {
      for (var key in wordMapping)
      {
        var entry = wordMapping[key];
        var finalLink = entry[2];
        if (!finalLink)
        {
          continue; // this word never occurred, so there is nothing to fix
        }
        var target = "'" + entry[0] + "', '" + 1 + "', '" + key.toLowerCase() + "'";
        finalLink.href = "javascript:change2(" + target + ")";
      }
    }




    /*
     * Generates the list of links to documents for each of the circular
     * index pages when the page loads.
     *   docDataList - array of [page, occurrenceCount] entries
     *   word        - the word the index page is about
     * For each entry, links numbered 1..occurrenceCount are emitted with
     * document.write; each one points the document frame at
     * page + '#' + word + n.  A paragraph break is written between
     * consecutive entries, but not after the last one.
     */
    function generateDocList(docDataList, word)
    {
      for (var i = 0; i < docDataList.length; i++)
      {
        // declared locally: it used to leak into the global scope
        var entry = docDataList[i];
        for (var n = 1; n <= entry[1]; n++)
        {
          document.write('<a name="' + entry[0] + n + '" href="javascript:void(parent.documentframe.location=\'' + entry[0] + '#' + word + n + '\')">' + entry[0] + ' ' + n + '</a> ');
        }
        if (i + 1 < docDataList.length)
        {
          document.write('</p><p>');
        }
      }
    }

     

    Still just as pointless as the original, but I managed to figure out how to process nodes by reading w3schools and asking Google search for examples... I don't think this version worked in IE either.


Log in to reply