Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Regex to match MediaWiki template and its parameters

I'm writing a simple JavaScript script to add a specific parameter to a specific template in the article that is currently being edited.

Wikipedia Templates are structured in the following format:

 {{Template name|unnamed parameter|named parameter=some value|another parameter=[[target article|article name]]|parameter={{another template|another tamplate's parameter}}}}

A template can also span multiple lines, for example:

{{Template 
|name=John
|surname=Smith
|pob=[[London|London, UK]]
}}

For further reference, please have a look at http://en.wikipedia.org/wiki/Help:Template

So firstly I'd like to match the entire template. I came up with a partial solution:

// Partial attempt: the lazy ((.|\n)*?) ends the match at the earliest "}}" found at a
// line end and does not track nesting, so a nested template can cut the match short.
document.editform.wpTextbox1.value.match(/\{\{template name((.|\n)*?)\}\}$/gmis)

However, the problem is that it only matches the text from the opening braces up to the closing braces of the first nested template (see the first example).

In addition, I'd like to fetch its parameters as an array. So for the result, I'd like to get an array with the parameters in a specific order: Array( value of parameter pob, value of parameter name, value of parameter surname, value of parameter pod (in this case empty, because it was unset) )

I'd use that to clean the unstandardised formatting in some articles and add some new parameters.

Thank you!

like image 905
smihael Avatar asked Jun 30 '11 08:06

smihael


1 Answers

Write a simple parser.

Solving this kind of problem by regexp is not right. It's the same as matching brackets - difficult to do with regexp. Regexps are not suitable for nested expressions in general.

Try something like this:

// Sketch only (not runnable: the branches have no bodies).
// Splitting on a regex with a capturing group keeps the "{{" and "}}" delimiters
// in the resulting array, so they can be walked as tokens.
var parts = src.split(/(\{\{|\}\})/);
for (var i in parts) {
  if (parts[i] == '{{') // starting new (sub) template
  else if (parts[i] == '}}') // ending (sub) template
  else // content (or outside)
}

This is just pseudo code, as I'm in a rush now; I will update it to working code...

UPDATE (9th August 2011)

// Parser states. The numeric values are kept exactly as they were.
var NO_TPL = 0, // outside any tpl - ignoring...
    IN_TPL = 1, // inside tpl
    IN_LIST = 3; // inside list of arguments

/**
 * Parse the MediaWiki templates contained in a piece of wikitext.
 *
 * The text is split on the markers {{ }} | = [[ ]] and walked with a small
 * state machine. Text outside of templates is ignored.
 *
 * Result shape:
 *  - the return value is always the top-level array, one entry per top-level
 *    template, even when the input ends inside an unclosed template or link;
 *  - a template is a plain object whose `id` is the template name and whose
 *    other keys are parameter names (a parameter literally named "id"
 *    overwrites the template name);
 *  - a parameter value is a trimmed string, an array of strings for a
 *    [[a|b]] link, or a nested template object for {{...}};
 *  - `name=` directly followed by `|`, `}}` or the end of input yields '';
 *  - unnamed parameters are stored under the keys "1", "2", ... in order of
 *    appearance. Blank unnamed parameters (e.g. "{{t||x}}") are skipped and
 *    do not advance the numbering.
 *
 * Values that mix several pieces (e.g. "a= [[x]] tail" or "a= b = c") are not
 * supported and raise "invalid".
 *
 * @param {string} src wikitext to scan
 * @returns {Array} the parsed top-level templates
 * @throws {string} the string "invalid" (kept as a string, not an Error, so
 *   existing callers that compare the thrown value keep working)
 */
function parseWiki(src) {
  var tokens = src.split(/(\{\{|\}\}|\||=|\[\[|\]\])/),
      i = -1, end = tokens.length - 1,
      token, state = NO_TPL,
      root = [], work = root,
      positional = 0,      // unnamed parameters seen so far in the current tpl
      afterPipe = false,   // was the previous token of the current tpl a '|'?
      startsParam,
      workChain = [], stateChain = [], positionalChain = [];

  // index of the next non-blank token after the cursor, or -1 if none is left
  function findNext() {
    for (var j = i + 1; j <= end; j++) {
      if (tokens[j].trim()) return j;
    }
    return -1;
  }

  // look at the next non-blank token without consuming it
  function peekNext() {
    var j = findNext();
    return j < 0 ? undefined : tokens[j].trim();
  }

  // consume and return the next non-blank token (undefined when exhausted)
  function getNext() {
    var j = findNext();
    i = j < 0 ? end : j;
    return j < 0 ? undefined : tokens[j].trim();
  }

  // go into tpl / list of arguments
  function goDown(newState, newWork, newWorkKey) {
    stateChain.push(state);
    workChain.push(work);
    positionalChain.push(positional);

    if (newWorkKey !== undefined) {
      work[newWorkKey] = newWork;
    } else {
      work.push(newWork);
    }

    work = newWork;
    state = newState;
    positional = 0;
    afterPipe = false;
  }

  // jump up from tpl / list of arguments
  function goUp() {
    work = workChain.pop();
    state = stateChain.pop();
    positional = positionalChain.pop();
  }

  // Handle one parameter of the current template. `first` is the token that
  // starts it; `canBePositional` tells whether it directly follows a '|'.
  function readParam(first, canBePositional) {
    var key, value;

    if (peekNext() === '=') {
      // named parameter: name '=' value
      key = first;
      getNext(); // consume '='
      value = peekNext();
      if (value === undefined || value === '|' || value === '}}') {
        // Empty value: leave the delimiter for the main loop so that the
        // parameter list / template is still terminated correctly.
        work[key] = '';
        return;
      }
      getNext(); // consume the value token
    } else {
      // Anything else is only accepted as an unnamed parameter right after a
      // '|'; stray pieces following a finished value stay an error.
      if (!canBePositional || first === '=' || first === ']]') throw "invalid";
      positional += 1;
      key = String(positional);
      value = first;
    }

    if (value === '[[') {
      goDown(IN_LIST, [], key);
    } else if (value === '{{') {
      goDown(IN_TPL, {id: getNext()}, key);
    } else {
      work[key] = value;
    }
  }

  // state machine
  while ((token = getNext())) {
    switch (state) {

      case IN_TPL:
        startsParam = afterPipe;
        afterPipe = token === '|';
        if (token === '}}') {
          goUp();
        } else if (token !== '|') {
          readParam(token, startsParam);
        }
        break;

      case IN_LIST:
        switch (token) {
          case ']]': goUp(); break;
          case '|': break;
          default: work.push(token);
        }
        break;

      case NO_TPL:
        if (token === '{{') {
          goDown(IN_TPL, {id: getNext()});
        }
        break;
    }
  }

  // Always hand back the top-level list, even if something was left unclosed.
  return root;
}

UNIT TESTS

// Jasmine spec for parseWiki. Each entry pairs a wikitext input with the
// structure the parser is expected to return; one spec is registered per entry.
describe('wikiTpl', function() {
  var cases = [
    {
      title: 'should do empty tpl',
      input: '{{name}}',
      expected: [{id: 'name'}]
    },
    {
      title: 'should ignore text outside from tpl',
      input: ' abc {{name}} x y',
      expected: [{id: 'name'}]
    },
    {
      title: 'should do simple param',
      input: '{{tpl | p1= 2}}',
      expected: [{id: 'tpl', p1: '2'}]
    },
    {
      title: 'should do list of arguments',
      input: '{{name | a= [[1|two]]}}',
      expected: [{id: 'name', a: ['1', 'two']}]
    },
    {
      title: 'should do param after list',
      input: '{{name | a= [[1|two|3]] | p2= true}}',
      expected: [{id: 'name', a: ['1', 'two', '3'], p2: 'true'}]
    },
    {
      title: 'should do more tpls',
      input: '{{first | a= [[1|two|3]] }} odd test {{second | b= 2}}',
      expected: [{id: 'first', a: ['1', 'two', '3']}, {id: 'second', b: '2'}]
    },
    {
      title: 'should allow nested tpl',
      input: '{{name | a= {{nested | p1= 1}} }}',
      expected: [{id: 'name', a: {id: 'nested', p1: '1'}}]
    }
  ];

  cases.forEach(function(spec) {
    it(spec.title, function() {
      expect(parseWiki(spec.input)).toEqual(spec.expected);
    });
  });
});

Note: I'm using Jasmine's syntax for these unit tests. You can easily run it using AngularJS which contains whole testing environment - check it out at http://angularjs.org

like image 192
Vojta Avatar answered Oct 13 '22 18:10

Vojta