Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Valid RegEx according to online test tools, not getting any matches when reading file in browser

I've designed this regex with several online tools and with help of the community:

https://regex101.com/r/hJ4pD5/1

(\s[A-Z]\.).+?(?=(\s[A-Z]\.)|(\W?(Answer:)\W?))

The goal is to extract all alternatives to a question. According to regexr and regex101 this is a valid Javascript regex which works well with the test data(pastebin):

1. Question goes here:
A. Answer one
B. Answer two 
C. Answer three D. Not indented Answer
Answer: B is correct

Expected matches should be:

"A. Answer one", "B. Answer two", "C. Answer three", "D. Not indented Answer"

But when I implement it in code this does not perform very well, no matches found.

(Try it with the pastebin data)

/**
 * Created by Schwusch on 01/08/2016.
 */
$(document).ready(start);
var questionsRaw;
var questionsFormatted = [];
var questionIndex = 0;

function readSingleFile(e) {
    var file = e.target.files[0];
    if (!file) {
        return;
    }
    var reader = new FileReader();
    reader.onload = function(e) {
        var contents = e.target.result;
        displayContents(contents);
    };
    reader.readAsText(file);
}
/* REGEX MAGIC -------------------------------------------------*/
function displayContents(contents) {
    questionsRaw = contents.split('---');
    $.each(questionsRaw, function(index, question ) {
        var answer = question.split("Answer:")[1];
        var splittedQuestion = question.split("A.")[0];
        var alternatives = question.match(/(\s[A-Z]\.).+?(?=(\s[A-Z]\.)|(\W?(Answer:)\W?))/g);
        questionsFormatted.push({
            question: splittedQuestion,
            alternatives: alternatives,
            answer: answer
        });
    });
/* END REGEX MAGIC -------------------------------------------------*/
    var element = document.getElementById('file-content');
    element.innerHTML = questionsFormatted[questionIndex].question;
    for (var i = 0; i < questionsFormatted[questionIndex].alternatives.length ; i++) {
        $('#alternatives').append('<button type="button" class="list-group-item">' + questionsFormatted[questionIndex].alternatives[i] + '</button>');
    }
}
function start() {
    document.getElementById('file-input')
        .addEventListener('change', readSingleFile, false);

    $(window).keydown(function(e) {
        e = e || event;
        switch(e.keyCode) {
            case 37: // left
                previousQuestion();
                return false;
            case 38: // up
                showQuestion();
                return false;
            case 39: // right
                nextQuestion();
                return false;
            case 40: // down
                showAnswer();
                return false;
        }
    });
    $(document).on('change', ':file', function() {
        var input = $(this),
            numFiles = input.get(0).files ? input.get(0).files.length : 1,
            label = input.val().replace(/\\/g, '/').replace(/.*\//, '');
        input.trigger('fileselect', [numFiles, label]);
    });

    $(':file').on('fileselect', function(event, numFiles, label) {
        var element = document.getElementById('filechoose');
        element.innerHTML = label;
    });
}

function showAnswer() {
    var element = document.getElementById('file-content');
    element.innerHTML = questionsFormatted[questionIndex].answer;
}

function showQuestion() {
    var element = document.getElementById('file-content');
    element.innerHTML = questionsFormatted[questionIndex].question;
}

function nextQuestion() {
    if (questionIndex < questionsFormatted.length - 1) questionIndex++ ;
    else questionIndex = 0;
    var element = document.getElementById('file-content');
    element.innerHTML = questionsFormatted[questionIndex].question;
    $( ".list-group-item" ).remove();
    for (var i = 0; i < questionsFormatted[questionIndex].alternatives.length ; i++) {
        $('#alternatives').append('<button type="button" class="list-group-item">' + questionsFormatted[questionIndex].alternatives[i] + '</button>');
    }
}

function previousQuestion() {
    if (questionIndex > 0) questionIndex-- ;
    else questionIndex = questionsFormatted.length - 1;
    var element = document.getElementById('file-content');
    element.innerHTML = questionsFormatted[questionIndex].question;
    $( ".list-group-item" ).remove();
    for (var i = 0; i < questionsFormatted[questionIndex].alternatives.length ; i++) {
        $('#alternatives').append('<button type="button" class="list-group-item">' + questionsFormatted[questionIndex].alternatives[i] + '</button>');
    }
}
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Question tool</title>
    <script src="https://code.jquery.com/jquery-3.1.0.js"
            integrity="sha256-slogkvB1K3VOkzAI8QITxV3VzpOnkeNVsKvtkYLMjfk=" crossorigin="anonymous"></script>

    <!-- Latest compiled and minified CSS -->
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css"
          integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" crossorigin="anonymous">

    <!-- Optional theme -->
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap-theme.min.css"
          integrity="sha384-rHyoN1iRsVXV4nD0JutlnGaslCJuC7uwjduW9SVrLvRYooPp2bWYgmgJQIXwl/Sp" crossorigin="anonymous">
    <script src="script.js"></script>
    <style>
        /* Move down content */
        body {
            padding-top: 20px;
            padding-bottom: 20px;
        }

    </style>
</head>
<body>

<div>
    <div class="container">

        <div class="jumbotron">
            <h3>Question Tool</h3>
            <label class="btn btn-default btn-file" id="filechoose">
                Choose File
                <input type="file" id="file-input" style="display: none;"/>
            </label>
            <div class="btn-group btn-group-justified" role="group" aria-label="...">
                <div class="btn-group" role="group">
                    <button type="button" class="btn btn-lg btn-primary" onclick="showAnswer()" role="button">
                        <span class="glyphicon glyphicon-arrow-down" aria-hidden="true"></span>Show Answer
                    </button>
                </div>
                <div class="btn-group" role="group">
                    <button type="button" class="btn btn-lg btn-success" onclick="showQuestion()" role="button">
                        <span class="glyphicon glyphicon-arrow-up" aria-hidden="true"></span>Show Question
                    </button>
                </div>
                <div class="btn-group" role="group">
                    <button type="button" class="btn btn-lg btn-danger" onclick="previousQuestion()" role="button">
                        <span class="glyphicon glyphicon-arrow-left" aria-hidden="true"></span>Previous Question
                    </button>
                </div>
                <div class="btn-group" role="group">
                    <button type="button" class="btn btn-lg btn-info" onclick="nextQuestion()" role="button">
                        <span class="glyphicon glyphicon-arrow-right" aria-hidden="true"></span>Next Question
                    </button>
                </div>
            </div>

            <div id="file-content" class="well"></div>
            <div id="alternatives" class="list-group">
            </div>
        </div>

    </div>
</div>

</body>
</html>

Why is it working in online testers but not in a browser?

like image 367
Schwusch Avatar asked Aug 02 '16 14:08

Schwusch


1 Answers

The reason why it does not work for you, is that contrary to the text you have used during your tests on regex101.com, the file you load uses \r\n as newline sequence instead of just \n.

Add to this that by default the . meta character does not match \r, and that JavaScript does not support the s modifier which could change this behaviour, you get fewer or no matches at all.

More concretely: in the regular expression the part .+? will stop matching characters when it encounters the \r. First it does so because it looks ahead and finds it can match the \r with either \s or \W, but the next \n does not match with either [A-Z] or the A of Answer:. So it then backtracks and tries to continue with the .+? part, but that also fails because \r cannot match with that as explained above. So the matching process then starts at the beginning of the regular expression again to find a potential next match. And this fails again and again for the same reasons.

To fix this, change two things:

  • Add a + after the \s in the middle, so it will not only match the \n, but also the preceding \r.

  • Change \W? to \W*, so again, it can match the preceding \r as well.

This should work:

/(\s[A-Z]\.).+?(?=(\s+[A-Z]\.)|(\W*(Answer:)\W?))/g

While this solves it, I would also suggest to simplify this regex further to:

/\s[A-Z]\..+?(?=\s+[A-Z]\.|\W*Answer:)/g

Notably the \W? at the end makes little sense: it either matches the \W or not, and in both cases you accept it.

like image 92
trincot Avatar answered Oct 19 '22 06:10

trincot