0

I have an XML string that contains some special characters (<, >, &) and therefore cannot be parsed with jQuery's $.parseXML.

This is the sample XML string

<?xml version="1.0" encoding="UTF-8"?>
<BackgroundCheck userId="{Username}" password="{Password}">
  <BackgroundSearchPackage action="submit" type="{PackageName}">
    <ReferenceId>ab<</ReferenceId>
    <UserArea>
      <PositionDetail>
        <EmploymentState>{StateJob}</EmploymentState>
        <ProposedSalary>{AnnualSalary}</ProposedSalary>
      </PositionDetail>
    </UserArea>
    <PersonalData>
      <PersonName>
        <GivenName>{FirstName}</GivenName>
        <MiddleName>{MiddleName}</MiddleName>
        <FamilyName>{LastName}</FamilyName>
        <Affix>{Generation}</Affix>
      </PersonName>
      <EmailAddress>{Email}</EmailAddress>
      <DemographicDetail>
        <GovernmentId countryCode="US" issuingAuthority="SSN">{SSN}</GovernmentId>
        <DateOfBirth>{DateOfBirth}</DateOfBirth>
      </DemographicDetail>
      {Aliases}
      {PostalAddress}
    </PersonalData>
    <Screenings useConfigurationDefaults="no">
      {Screenings}
      <AdditionalItems type="x:interface">
        <Text>{Search&Type}</Text>
      </AdditionalItems>
      <AdditionalItems type="x:return_xml_results">
        <Text>yes</Text>
      </AdditionalItems>
      <AdditionalItems type="x:embed_credentials">
        <Text>true</Text>
      </AdditionalItems>
      <AdditionalItems type="x:integration_type">
        <Text>Sample XML</Text>
      </AdditionalItems>
      <AdditionalItems type="x:postback_url">
        <Text>{CallbackURL}</Text>
      </AdditionalItems>
      {AdditionalItems}
    </Screenings>
    {Documentation}
  </BackgroundSearchPackage>
</BackgroundCheck>

Note the value of the ReferenceId tag on the 4th line: it contains a special character, and hence this string cannot be parsed as XML.

What I need is to replace those special characters with their escape sequences (&lt;, &gt;, &amp;). The closest I came across is this

how to escape xml entities in javascript?

But this answer assumes that we have XML node values already with us.

My requirement is different: I have the complete XML as a string, and I want to replace only the node values without touching the tag names (tags also contain <, >).

This is what i tried using jQuery

// Asker's attempt: iterate over the elements jQuery builds from `xml`,
// serialise each one, and try to escape the XML-special characters in it.
$(xml).each(function() {
            // Wrap the element in a temporary <p> so that the parent's html()
            // yields the element's own markup rather than only its children.
            var t = $(this).wrap('<p/>').parent().html();

            // NOTE(review): String.prototype.replace returns a new string and
            // leaves `t` untouched. The result of this chain is never stored,
            // so the value assigned to `xml` below is still unescaped.
            t.replace(/&/g, '&amp;')
                   .replace(/</g, '&lt;')
                   .replace(/>/g, '&gt;')
                   .replace(/"/g, '&quot;')
                   .replace(/'/g, '&apos;');
            // Overwrites the outer `xml` with the markup serialised by jQuery.
            xml = t;
        });

This is working fine; the only problem with this code is that it converts the XML tags to lower case. I think this is because of jQuery's behavior.

Please suggest a fix/solution for this. Thanks

11
  • Welcome to SO. Please visit How to Ask Commented Jan 12, 2018 at 6:27
  • @mplungjan can you be kind enough to point out what's wrong with this question? Commented Jan 12, 2018 at 6:36
  • Please post effort and CODE - what did you try? Example input and example failing output? Commented Jan 12, 2018 at 6:37
  • @mplungjan Edited my Question with what i tried. Any help now? Commented Jan 12, 2018 at 6:49
  • Please add an XML example. Click <> and create a minimal reproducible example Commented Jan 12, 2018 at 7:23

2 Answers 2

2
  1. "text/html" works better -
  2. Use the html to convert the textContent

// Single DOMParser instance shared by replaceIllegalXML() and the demo code below.
var oParser = new DOMParser();

/**
 * Escape XML-special characters (& < > " ') that appear inside element text
 * of an otherwise XML-shaped string, leaving the tags themselves untouched.
 *
 * The string is parsed leniently as "text/html" purely to obtain its text
 * content. Each line of that text which needs escaping is then located in the
 * original string as `>text<` and replaced by its escaped form.
 *
 * The lookup is a literal substring search, so text containing regex
 * metacharacters such as `(`, `)` or `$` is matched correctly, and the
 * replacement is supplied through a function so `$&`, `$$` etc. in the text
 * are inserted verbatim.
 *
 * @param {string} t - The possibly malformed XML text.
 * @returns {string|undefined} The text with offending values escaped, or
 *   undefined when the parser reports a "parsererror" document.
 */
function replaceIllegalXML(t) {
  const oDOM = oParser.parseFromString(t, "text/html");
  if (oDOM.documentElement.nodeName === "parsererror") {
    console.log("Could not parse the string");
    return;
  }

  let result = t;
  const allTexts = oDOM.documentElement.textContent.split("\n");
  for (const rawText of allTexts) {
    // "&" must be handled first, otherwise the entities produced by the
    // later replacements would be escaped a second time.
    const escaped = rawText
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .replace(/"/g, "&quot;")
      .replace(/'/g, "&apos;");
    if (escaped === rawText) {
      continue; // nothing illegal on this line
    }

    // A string pattern replaces only the first literal occurrence, which is
    // what the former non-global regex did for metacharacter-free text.
    const needle = ">" + rawText.trim() + "<";
    const replacement = ">" + escaped.trim() + "<";
    result = result.replace(needle, () => replacement);
  }
  return result;
}
// Sample input for the demo: XML-shaped text that is deliberately not
// well-formed (a stray "<" in <ReferenceId> and a bare "&" in one <Text>).
var t = `<?xml version="1.0" encoding="UTF-8"?>
<BackgroundCheck userId="{Username}" password="{Password}">
  <BackgroundSearchPackage action="submit" type="{PackageName}">
    <ReferenceId>ab<</ReferenceId>
    <UserArea>
      <PositionDetail>
        <EmploymentState>{StateJob}</EmploymentState>
        <ProposedSalary>{AnnualSalary}</ProposedSalary>
      </PositionDetail>
    </UserArea>
    <PersonalData>
      <PersonName>
        <GivenName>{FirstName}</GivenName>
        <MiddleName>{MiddleName}</MiddleName>
        <FamilyName>{LastName}</FamilyName>
        <Affix>{Generation}</Affix>
      </PersonName>
      <EmailAddress>{Email}</EmailAddress>
      <DemographicDetail>
        <GovernmentId countryCode="US" issuingAuthority="SSN">{SSN}</GovernmentId>
        <DateOfBirth>{DateOfBirth}</DateOfBirth>
      </DemographicDetail>
      {Aliases}
      {PostalAddress}
    </PersonalData>
    <Screenings useConfigurationDefaults="no">
      {Screenings}
      <AdditionalItems type="x:interface">
        <Text>{Search&Type}</Text>
      </AdditionalItems>
      <AdditionalItems type="x:return_xml_results">
        <Text>yes</Text>
      </AdditionalItems>
      <AdditionalItems type="x:embed_credentials">
        <Text>true</Text>
      </AdditionalItems>
      <AdditionalItems type="x:integration_type">
        <Text>Sample XML</Text>
      </AdditionalItems>
      <AdditionalItems type="x:postback_url">
        <Text>{CallbackURL}</Text>
      </AdditionalItems>
      {AdditionalItems}
    </Screenings>
    {Documentation}
  </BackgroundSearchPackage>
</BackgroundCheck>`

// Escape the stray characters first, then parse the result as real XML.
t = replaceIllegalXML(t);
var newDOM = oParser.parseFromString(t, "text/xml");
// Engines differ in where they report XML errors: some return a document whose
// root element is <parsererror>, others nest a <parsererror> element inside
// the partially parsed document. Searching for the element covers both cases.
var nok = newDOM.getElementsByTagName("parsererror").length > 0;
if (nok) console.log("xml parsing failed");
else console.log(newDOM.getElementsByTagName("ReferenceId")[0].textContent);

Sign up to request clarification or add additional context in comments.

10 Comments

Thanks for the answer :). parsing as 'text/html' converts all the tags to lowercase. For example,BackgroundCheck becomes backgroundcheck. Is there some way to fix it?
Have a look. Perhaps my solution works for you? It is not really where you need to fix the issue...
PS: You MAY want to do .replace(/&/g, '&amp;amp;')
Wonderful answer. However, it failed with one of my XML string which had the node <Data>CR-CRIMINAL MISCHIEF ( < =$200)</Data>. I think this is because the value has braces() in it.
Or the $ has a special regex meaning and needs to be escaped
|
1

I finally achieved what i needed. Thanks to @mplungjan. As he also pointed out, an XML file must be valid to get parsed and making it valid should be done where the XML is created.

My scenario was somewhat different. I HAD TO fix the invalid XML string in javascript only, before i could parse it.

I had to apply a dirty hack to achieve it. While parsing the XML string in JavaScript (or any other programming language), we get an error specifying what's wrong AND the line number. What I did was this

  • Retrieve the line number from error.
  • Extract that line from the string and fix it.
  • Repeat the process till the complete XML is fixed.

Here's what i did

// DOMParser instance used by process() for both the XML parse and the parse of the error markup.
var oParser = new DOMParser();

        /**
         * Backslash-escape every regular-expression metacharacter in `str` so the
         * result can be embedded in a RegExp source and match `str` literally.
         *
         * @param {string} str - Raw text.
         * @returns {string} The text with each metacharacter preceded by a backslash.
         */
        function escapeRegExp(str) {
          const metaCharacters = /[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g;
          // "$&" re-inserts the matched character after the added backslash.
          const escapedText = str.replace(metaCharacters, "\\$&");
          return escapedText;
        }

        /**
         * Escape the XML-special characters in the element text found on one
         * line of an XML string.
         *
         * The text is taken as everything between the first ">" and the last "<"
         * on that line (greedy on purpose, so a stray "<" inside the value is
         * included). This assumes one element per line; if several elements
         * share the line, the markup between them is escaped as well.
         *
         * The line is edited in place by position instead of via a RegExp built
         * from the text, so metacharacters, "$" sequences and surrounding
         * whitespace in the value cannot break the substitution.
         *
         * @param {string} str - The complete XML text.
         * @param {number|string} line - 1-based number of the offending line.
         * @returns {string|null} The XML text with that line repaired, or null
         *   when the line does not exist, holds no `>...<` span, or needs no
         *   escaping (i.e. no progress can be made).
         */
        function remove_error(str, line) {
            const lines = str.split("\n");
            const index = Number(line) - 1;
            const illegal = lines[index];
            if (typeof illegal !== "string") {
                return null;
            }

            const match = />(.*)</.exec(illegal);
            if (!match) {
                return null;
            }

            const extract = match[1];
            // "&" first, so entities created by the later steps are not re-escaped.
            const fix_extract = extract.replace(/&/g, '&amp;')
               .replace(/</g, '&lt;')
               .replace(/>/g, '&gt;')
               .replace(/"/g, '&quot;')
               .replace(/'/g, '&apos;');
            if (fix_extract === extract) {
                return null;
            }

            // The captured text starts right after the ">" the match begins with.
            const start = match.index + 1;
            lines[index] = illegal.slice(0, start) + fix_extract + illegal.slice(start + extract.length);
            return lines.join("\n");
        }

        // Shared results, kept for code that reads them directly: `fixed` is the
        // latest value returned by remove_error(); `final_fixed` is the last
        // string that parsed cleanly ("" while a repair is running or has failed).
        var fixed = "", final_fixed = "";

        /**
         * Repair an XML string until the XML parser accepts it.
         *
         * Each pass parses the text as "text/xml". If a <parsererror> element is
         * present, the line number is pulled from its message, remove_error() is
         * asked to repair that line, and the repaired text is processed again.
         *
         * @param {string} orig_str - XML text to validate and repair.
         * @returns {string|undefined} The text that finally parsed without
         *   error, or undefined when it could not be repaired.
         */
        function process(orig_str) {
            const newDOM = oParser.parseFromString(orig_str, "text/xml");
            const error = newDOM.getElementsByTagName("parsererror");

            if (!error || !error.length) {
                // Parsed cleanly: this is the result.
                final_fixed = orig_str;
                return orig_str;
            }

            // Forget any earlier success so a failed repair cannot report a
            // result that belongs to a previous call.
            final_fixed = "";

            if (!error[0] || !error[0].innerHTML) {
                alert('XML could not be parsed');
                return;
            }

            // The message is looked up in the first <div> of the error markup;
            // fall back to the raw markup when there is no such element.
            const err_html = error[0].innerHTML;
            const parse_err = oParser.parseFromString(err_html, "text/html");
            const err_div = parse_err.getElementsByTagName("div")[0];
            const err_log = err_div ? err_div.innerHTML : err_html;
            console.log(err_log);

            // Pull the number out of "... line N at ...".
            const line_match = /line\s*(.*?)\s*at/.exec(err_log);

            let result;
            if (line_match) {
                fixed = remove_error(orig_str, line_match[1]);
                // Recurse only when the text actually changed; otherwise the
                // same error would be reported again without end.
                if (fixed && fixed !== orig_str) {
                    result = process(fixed);
                }
            }

            $('.welcome-page section.welcome .inner').html("<h3 class='text-center'>Invalid XML</h3>");
            return result;
        }
// Despite the name, `newDOM` receives what process() returns: the repaired XML
// string, or undefined when no parsable result was produced.
// NOTE(review): `res` and the function this `return` belongs to are not part
// of the visible snippet — presumably this runs inside a response handler.
var newDOM = process(res[0][0]);

        // Nothing usable came back: tell the user and stop.
        if (!newDOM) {
            alert('XML could not be parsed');
            return;
        }

I know what i did is just a hack. But i didn't have any other options.

PS- Any edits to this answer are welcome.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.