Regex to replace special characters with escape sequence in XML string

Question

I have an XML string which contains some special characters (<, >, &) in its node values and hence cannot be parsed using jQuery's $.parseXML.

This is the sample XML string

<?xml version="1.0" encoding="UTF-8"?>
<BackgroundCheck userId="{Username}" password="{Password}">
  <BackgroundSearchPackage action="submit" type="{PackageName}">
    <ReferenceId>ab<</ReferenceId>
    <UserArea>
      <PositionDetail>
        <EmploymentState>{StateJob}</EmploymentState>
        <ProposedSalary>{AnnualSalary}</ProposedSalary>
      </PositionDetail>
    </UserArea>
    <PersonalData>
      <PersonName>
        <GivenName>{FirstName}</GivenName>
        <MiddleName>{MiddleName}</MiddleName>
        <FamilyName>{LastName}</FamilyName>
        <Affix>{Generation}</Affix>
      </PersonName>
      <EmailAddress>{Email}</EmailAddress>
      <DemographicDetail>
        <GovernmentId countryCode="US" issuingAuthority="SSN">{SSN}</GovernmentId>
        <DateOfBirth>{DateOfBirth}</DateOfBirth>
      </DemographicDetail>
      {Aliases}
      {PostalAddress}
    </PersonalData>
    <Screenings useConfigurationDefaults="no">
      {Screenings}
      <AdditionalItems type="x:interface">
        <Text>{Search&Type}</Text>
      </AdditionalItems>
      <AdditionalItems type="x:return_xml_results">
        <Text>yes</Text>
      </AdditionalItems>
      <AdditionalItems type="x:embed_credentials">
        <Text>true</Text>
      </AdditionalItems>
      <AdditionalItems type="x:integration_type">
        <Text>Sample XML</Text>
      </AdditionalItems>
      <AdditionalItems type="x:postback_url">
        <Text>{CallbackURL}</Text>
      </AdditionalItems>
      {AdditionalItems}
    </Screenings>
    {Documentation}
  </BackgroundSearchPackage>
</BackgroundCheck>

Note the value of the ReferenceId tag on the 4th line: it contains a special character, and hence this string cannot be parsed as XML.

What I need is to replace those special characters with their escape sequences (&lt;, &gt;, &amp;). The closest I came across is this:

how to escape xml entities in javascript?

But this answer assumes that we have XML node values already with us.

My requirement is different: I have the complete XML as a string, and I want to replace only the node values without touching the tag names (the tags also contain < and >).

This is what i tried using jQuery

// Attempt from the question: for every element jQuery builds from the `xml`
// string, serialise it back to markup and try to escape the XML-special
// characters in that markup.
$(xml).each(function() {
            // Wrap the element in a <p> so that .parent().html() yields the
            // element's own outer markup as a string.
            var t = $(this).wrap('<p/>').parent().html();

            // NOTE(review): String.prototype.replace returns a new string and
            // the result of this chain is never assigned, so `t` is unchanged
            // when it is stored in `xml` below.
            t.replace(/&/g, '&amp;')
                   .replace(/</g, '&lt;')
                   .replace(/>/g, '&gt;')
                   .replace(/"/g, '&quot;')
                   .replace(/'/g, '&apos;');
            // Overwrites the outer `xml` variable on every iteration, so only
            // the markup of the last visited element is kept.
            xml = t;
        });

This is working fine; the only problem with this code is that it converts the XML tags to lower case. I think this is because of jQuery's behavior.

Please suggest a fix/solution for this. Thanks.

@mplungjan can you be kind enough to point out what's wrong with this question? — Vishal Dubey
– Vishal Dubey, Commented Jan 12, 2018 at 6:36
Please post effort and CODE - what did you try? Example input and example failing output? — mplungjan
– mplungjan, Commented Jan 12, 2018 at 6:37
@mplungjan Edited my Question with what i tried. Any help now? — Vishal Dubey
– Vishal Dubey, Commented Jan 12, 2018 at 6:49
Please add an XML example. Click <> and create a minimal reproducible example — mplungjan
– mplungjan, Commented Jan 12, 2018 at 7:23

mplungjan · Accepted Answer · 2018-01-12 09:58:19Z

2

"text/html" works better -
Use the html to convert the textContent

// Shared DOMParser instance; replaceIllegalXML and the driver code both parse through it.
var oParser = new DOMParser();

/**
 * Escapes XML-special characters (& < > " ') that appear in the text content
 * of a malformed XML string, leaving the surrounding tags untouched.
 *
 * The string is parsed with the lenient "text/html" mode of the shared
 * `oParser` only to discover its text content. Each non-empty text line is
 * then looked up in the original string as `>text<` and replaced by its
 * escaped form, so tag-name casing in `t` is never altered.
 *
 * @param {string} t - The possibly malformed XML document.
 * @returns {string|undefined} The repaired string, or undefined when the
 *   parser reports a `parsererror` root element.
 */
function replaceIllegalXML(t) {
  const oDOM = oParser.parseFromString(t, "text/html");
  // NOTE(review): HTML parsing is error-tolerant, so this branch is presumably
  // never taken; it is kept so the original contract is unchanged.
  const nok = oDOM.documentElement.nodeName == "parsererror";
  if (nok) {
    console.log("Could not parse the string");
    return;
  }
  const allTexts = oDOM.documentElement.textContent.split("\n");
  for (let i = 0; i < allTexts.length; i++) {
    const raw = allTexts[i].trim();
    if (raw === "") {
      continue;
    }
    const repl = raw.replace(/&/g, '&amp;')
                    .replace(/</g, '&lt;')
                    .replace(/>/g, '&gt;')
                    .replace(/"/g, '&quot;')
                    .replace(/'/g, '&apos;');
    if (repl !== raw) {
      // A literal string pattern keeps regex metacharacters in the text
      // (parentheses, `$`, `*`, ...) from changing what is searched for, and a
      // function replacer keeps `$&`, `$1`, ... in the text from being
      // expanded as replacement patterns.
      t = t.replace(">" + raw + "<", () => ">" + repl + "<");
    }
  }
  return t;
}
// Sample input: an XML document whose node values contain raw `<` and `&`
// (see ReferenceId and the {Search&Type} text), so a strict XML parse fails.
var t = `<?xml version="1.0" encoding="UTF-8"?>
<BackgroundCheck userId="{Username}" password="{Password}">
  <BackgroundSearchPackage action="submit" type="{PackageName}">
    <ReferenceId>ab<</ReferenceId>
    <UserArea>
      <PositionDetail>
        <EmploymentState>{StateJob}</EmploymentState>
        <ProposedSalary>{AnnualSalary}</ProposedSalary>
      </PositionDetail>
    </UserArea>
    <PersonalData>
      <PersonName>
        <GivenName>{FirstName}</GivenName>
        <MiddleName>{MiddleName}</MiddleName>
        <FamilyName>{LastName}</FamilyName>
        <Affix>{Generation}</Affix>
      </PersonName>
      <EmailAddress>{Email}</EmailAddress>
      <DemographicDetail>
        <GovernmentId countryCode="US" issuingAuthority="SSN">{SSN}</GovernmentId>
        <DateOfBirth>{DateOfBirth}</DateOfBirth>
      </DemographicDetail>
      {Aliases}
      {PostalAddress}
    </PersonalData>
    <Screenings useConfigurationDefaults="no">
      {Screenings}
      <AdditionalItems type="x:interface">
        <Text>{Search&Type}</Text>
      </AdditionalItems>
      <AdditionalItems type="x:return_xml_results">
        <Text>yes</Text>
      </AdditionalItems>
      <AdditionalItems type="x:embed_credentials">
        <Text>true</Text>
      </AdditionalItems>
      <AdditionalItems type="x:integration_type">
        <Text>Sample XML</Text>
      </AdditionalItems>
      <AdditionalItems type="x:postback_url">
        <Text>{CallbackURL}</Text>
      </AdditionalItems>
      {AdditionalItems}
    </Screenings>
    {Documentation}
  </BackgroundSearchPackage>
</BackgroundCheck>`

// Repair the sample, then re-parse it strictly as XML to confirm the result
// is now well-formed.
t = replaceIllegalXML(t);
var newDOM = oParser.parseFromString(t, "text/xml");
var nok = "parsererror" == newDOM.documentElement.nodeName;
if (!nok) {
  // Success: print the repaired value of the first ReferenceId element.
  console.log(newDOM.getElementsByTagName("ReferenceId")[0].textContent);
} else {
  console.log("xml parsing failed");
}

edited Jan 12, 2018 at 9:58

answered Jan 12, 2018 at 9:07

mplungjan

180k29 gold badges183 silver badges246 bronze badges

Sign up to request clarification or add additional context in comments.

10 Comments

Vishal Dubey Over a year ago

Thanks for the answer :). parsing as 'text/html' converts all the tags to lowercase. For example,BackgroundCheck becomes backgroundcheck. Is there some way to fix it?

mplungjan Over a year ago

Have a look. Perhaps my solution works for you? It is not really where you need to fix the issue...

mplungjan Over a year ago

PS: You MAY want to do .replace(/&/g, '&amp;')

Vishal Dubey Over a year ago

Wonderful answer. However, it failed with one of my XML string which had the node <Data>CR-CRIMINAL MISCHIEF ( < =$200)</Data>. I think this is because the value has braces() in it.

mplungjan Over a year ago

Or the $ has a special regex meaning and needs to be escaped

|

Vishal Dubey · Accepted Answer · 2018-01-15 11:27:52Z

I finally achieved what i needed. Thanks to @mplungjan. As he also pointed out, an XML file must be valid to get parsed and making it valid should be done where the XML is created.

My scenario was somewhat different. I HAD TO fix the invalid XML string in JavaScript only, before I could parse it.

I had to apply a dirty hack to achieve it. While parsing the XML string in JavaScript (or any other programming language), we get an error specifying what's wrong AND the line number. What I did was this:

Retrieve the line number from error.
Extract that line from the string and fix it.
Repeat the process till the complete XML is fixed.

Here's what i did

// Shared DOMParser instance used by process() for its XML and HTML parses.
var oParser = new DOMParser();

        /**
         * Prefixes every regular-expression metacharacter in `str` with a
         * backslash so the result can be embedded in a RegExp source and
         * matched literally.
         *
         * @param {string} str - Text to be matched literally.
         * @returns {string} `str` with - [ ] / { } ( ) * + ? . \ ^ $ | escaped.
         */
        function escapeRegExp(str) {
          var metaChars = /[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g;
          // "$&" re-inserts the matched character after the added backslash.
          var escaped = str.replace(metaChars, "\\$&");
          return escaped;
        }

        /**
         * Escapes the XML-special characters in the text of one line of `str`.
         *
         * The text is taken as everything between the first `>` and the last
         * `<` on the given line; & < > " ' inside it are replaced by their
         * entity references and the line is rewritten in place. Whitespace
         * around the text is preserved.
         *
         * @param {string} str - The complete (malformed) XML document.
         * @param {number|string} line - 1-based number of the line to repair,
         *   as reported by the XML parser.
         * @returns {string|undefined} The document with that line repaired, or
         *   undefined when the line does not exist, contains no `>...<` span,
         *   or has nothing to escape, so callers can stop instead of retrying
         *   the same input forever.
         */
        function remove_error(str, line) {
            const lines = str.split("\n");
            const index = Number(line) - 1;
            const illegal = lines[index];
            if (typeof illegal !== "string") {
                return undefined;
            }

            const found = illegal.match(/>(.*)</);
            if (found === null) {
                return undefined;
            }

            const extract = found[1];
            const fix_extract = extract.replace(/&/g, '&amp;')
               .replace(/</g, '&lt;')
               .replace(/>/g, '&gt;')
               .replace(/"/g, '&quot;')
               .replace(/'/g, '&apos;');
            if (fix_extract === extract) {
                return undefined;
            }

            // Splice by position on the reported line rather than searching
            // the whole document with a RegExp: this is immune to regex
            // metacharacters and `$` replacement patterns in the text, and it
            // cannot hit an unrelated line.
            const start = found.index + 1;
            lines[index] = illegal.slice(0, start) + fix_extract + illegal.slice(start + extract.length);
            return lines.join("\n");
        }

        // Module-level results kept for backward compatibility: `fixed` holds
        // the most recent intermediate repair, `final_fixed` the last fully
        // repaired document ("" when the latest run did not succeed).
        var fixed = "";
        var final_fixed = "";

        /**
         * Repeatedly parses `orig_str` as XML and repairs the line named in
         * the parser's error report (via remove_error) until it parses cleanly.
         *
         * The line number is read from the text of the first `parsererror`
         * element. NOTE(review): the pattern is meant to accept messages of
         * the form "error on line N at column M" and "Line Number N, Column M"
         * - confirm against the browsers you target.
         *
         * Side effects: updates the module-level `fixed` / `final_fixed`;
         * calls alert() when no line number can be found in the error text;
         * writes an "Invalid XML" banner into the page when the document
         * cannot be repaired.
         *
         * @param {string} orig_str - The possibly malformed XML document.
         * @returns {string|undefined} The repaired document, or undefined when
         *   it could not be repaired.
         */
        function process(orig_str) {
            var current = orig_str;
            // Lines already repaired once. Repairing the same line again would
            // only double-escape it, so a repeat report means the document is
            // beyond this heuristic and the loop must stop.
            var attempted = new Set();
            // Reset so a failed run never returns a previous run's result.
            final_fixed = "";

            while (true) {
                var dom = oParser.parseFromString(current, "text/xml");
                var errors = dom.getElementsByTagName("parsererror");
                if (!errors || !errors.length) {
                    final_fixed = current;
                    return final_fixed;
                }

                var err_log = errors[0].textContent || "";
                var located = /\bline\s+(?:number\s+)?(\d+)/i.exec(err_log);
                if (!located) {
                    alert('XML could not be parsed');
                    return;
                }

                var err_line = located[1];
                if (attempted.has(err_line)) {
                    break;
                }
                attempted.add(err_line);

                var candidate = remove_error(current, err_line);
                // No result or no progress: retrying would loop forever.
                if (!candidate || candidate === current) {
                    break;
                }
                fixed = candidate;
                current = candidate;
            }

            // Reached only when the document could not be repaired.
            $('.welcome-page section.welcome .inner').html("<h3 class='text-center'>Invalid XML</h3>");
            return;
        }
// Repair the XML string before using it. `res` is defined outside this excerpt.
// Despite its name, `newDOM` receives process()'s return value (the repaired
// string, or undefined on failure), not a parsed document.
var newDOM = process(res[0][0]);

        // NOTE(review): the bare `return` implies this fragment lives inside
        // an enclosing function that is not shown here.
        if (!newDOM) {
            alert('XML could not be parsed');
            return;
        }

I know what I did is just a hack, but I didn't have any other options.

PS- Any edits to this answer are welcome.

Collectives™ on Stack Overflow

Regex to replace special characters with escape sequence in XML string

2 Answers 2

10 Comments

Comments

Your Answer

Linked

Hot Network Questions

Collectives™ on Stack Overflow

2 Answers 2

10 Comments

Comments

Your Answer

Sign up or log in

Post as a guest

Linked

Related