Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Get All Links in a Document

Given a "normal document" in Google Docs/Drive (e.g. paragraphs, lists, tables) which contains external links scattered throughout the content, how do you compile a list of links present using Google Apps Script?

Specifically, I want to update all broken links in the document by searching for oldText in each url and replace it with newText in each url, but not the text.

I don't think the replacing text section of the Dev Documentation is what I need -- do I need to scan every element of the doc? Can I just editAsText and use an html regex? Examples would be appreciated.

like image 404
drzaus Avatar asked Sep 10 '13 19:09

drzaus


People also ask

How do I extract hyperlinks from a Word document?

Method 1: Utilize the “Find” Feature On the drop-down menu choose “Advanced Find” to open the “Find and Replace” dialog box. Next click “More” button and click “Format”. Choose “Style” to open “Find Style” box. Select “Hyperlink” and click “OK”.

How do I extract hyperlinks from a PDF?

Step-1: Install PyPDF2 on your local system by typing pip install PyPDF2 in the command shell. Step-2: Import PyPDF2. Step-3: Open the PDF in Binary mode and it recognizes links in the file. Step-4: Define a function to extract the hyperlink for a particular PDF page.

What does Alt F9 do in Word?

Alt+F9 will toggle the display of all fields in the document. The setting is a global one, which can also be set through File | Options | Advanced: Show document content: Show field codes instead of their values.


1 Answers

This is only mostly painful! Code is available as part of a gist.

ScreenshotYeah, I can't spell.

getAllLinks

Here's a utility function that scans the document for all LinkUrls, returning them in an array.

/**
 * Get an array of all LinkUrls in the document. The function is
 * recursive, and if no element is provided, it will default to
 * the active document's Body element.
 *
 * @param {Element} element The document element to operate on. 
 * .
 * @returns {Array}         Array of objects, vis
 *                              {element,
 *                               startOffset,
 *                               endOffsetInclusive, 
 *                               url}
 */
function getAllLinks(element) {
  var links = [];
  element = element || DocumentApp.getActiveDocument().getBody();
  
  if (element.getType() === DocumentApp.ElementType.TEXT) {
    var textObj = element.editAsText();
    var text = element.getText();
    var inUrl = false;
    for (var ch=0; ch < text.length; ch++) {
      var url = textObj.getLinkUrl(ch);
      if (url != null) {
        if (!inUrl) {
          // We are now!
          inUrl = true;
          var curUrl = {};
          curUrl.element = element;
          curUrl.url = String( url ); // grab a copy
          curUrl.startOffset = ch;
        }
        else {
          curUrl.endOffsetInclusive = ch;
        }          
      }
      else {
        if (inUrl) {
          // Not any more, we're not.
          inUrl = false;
          links.push(curUrl);  // add to links
          curUrl = {};
        }
      }
    }
    if (inUrl) {
      // in case the link ends on the same char that the element does
      links.push(curUrl); 
    }
  }
  else {
    var numChildren = element.getNumChildren();
    for (var i=0; i<numChildren; i++) {
      links = links.concat(getAllLinks(element.getChild(i)));
    }
  }

  return links;
}

findAndReplaceLinks

This utility builds on getAllLinks to do a find & replace function.

/**
 * Replace all or part of UrlLinks in the document.
 *
 * @param {String} searchPattern    the regex pattern to search for 
 * @param {String} replacement      the text to use as replacement
 *
 * @returns {Number}                number of Urls changed 
 */
function findAndReplaceLinks(searchPattern,replacement) {
  var links = getAllLinks();
  var numChanged = 0;
  
  for (var l=0; l<links.length; l++) {
    var link = links[l];
    if (link.url.match(searchPattern)) {
      // This link needs to be changed
      var newUrl = link.url.replace(searchPattern,replacement);
      link.element.setLinkUrl(link.startOffset, link.endOffsetInclusive, newUrl);
      numChanged++
    }
  }
  return numChanged;
}

Demo UI

To demonstrate the use of these utilities, here are a couple of UI extensions:

function onOpen() {
  // Add a menu with some items, some separators, and a sub-menu.
  DocumentApp.getUi().createMenu('Utils')
      .addItem('List Links', 'sidebarLinks')
      .addItem('Replace Link Text', 'searchReplaceLinks')
      .addToUi();
}

function searchReplaceLinks() {
  var ui = DocumentApp.getUi();
  var app = UiApp.createApplication()
                 .setWidth(250)
                 .setHeight(100)
                 .setTitle('Change Url text');
  var form = app.createFormPanel();
  var flow = app.createFlowPanel();
  flow.add(app.createLabel("Find: "));
  flow.add(app.createTextBox().setName("searchPattern"));
  flow.add(app.createLabel("Replace: "));
  flow.add(app.createTextBox().setName("replacement"));
  var handler = app.createServerHandler('myClickHandler');
  flow.add(app.createSubmitButton("Submit").addClickHandler(handler));
  form.add(flow);
  app.add(form);
  ui.showDialog(app);
}

// ClickHandler to close dialog
function myClickHandler(e) {
  var app = UiApp.getActiveApplication();

  app.close();
  return app;
}

function doPost(e) {
  var numChanged = findAndReplaceLinks(e.parameter.searchPattern,e.parameter.replacement);
  var ui = DocumentApp.getUi();
  var app = UiApp.createApplication();
  
  sidebarLinks(); // Update list

  var result = DocumentApp.getUi().alert(
      'Results',
      "Changed "+numChanged+" urls.",
      DocumentApp.getUi().ButtonSet.OK);
}


/**
 * Shows a custom HTML user interface in a sidebar in the Google Docs editor.
 */
function sidebarLinks() {
  var links = getAllLinks();
  var sidebar = HtmlService
          .createHtmlOutput()
          .setTitle('URL Links')
          .setWidth(350 /* pixels */);

  // Display list of links, url only.
  for (var l=0; l<links.length; l++) {
    var link = links[l];
    sidebar.append('<p>'+link.url);
  }
  
  DocumentApp.getUi().showSidebar(sidebar);
}
like image 185
Mogsdad Avatar answered Oct 09 '22 09:10

Mogsdad