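The code below iterates over the `data` nodes of an XML file (selected with an XPath expression) and, for each one, applies the regular expression stored in its `rule` child node to the text in its `ocrstring` child node, writing the result to its `output` child node. The XML is included at the bottom.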
Are there better alternatives to this approach? Would using LINQ be a good approach?
using System;
using System.Text.RegularExpressions;
using System.Xml;
using System.Xml.XPath;
namespace XMLParser
{
    /// <summary>
    /// Console tool that reads an offer XML file, applies the regular expression
    /// found in each <c>data/rule</c> element to the text in the sibling
    /// <c>data/ocrstring</c> element, and stores the result in <c>data/output</c>.
    /// </summary>
    class Program
    {
        /// <summary>Text written to <c>output</c> when the rule yields no text.</summary>
        private const string NoMatchPlaceholder = "NA";

        /// <summary>
        /// Loads the XML file, processes every <c>//offer/data</c> node and saves
        /// the document back to the same path if at least one node was updated.
        /// </summary>
        static void Main()
        {
            string filename = "C:\\Users\\name\\train\\dev\\offer\\TestParsing.xml";

            XmlDocument xmlDoc = new XmlDocument();
            xmlDoc.Load(filename);

            // Write the file once, after all nodes are processed, instead of
            // rewriting it on every loop iteration.
            if (ProcessOffer(xmlDoc))
            {
                xmlDoc.Save(filename);
            }

            Console.WriteLine("Exiting...");
        }

        /// <summary>
        /// Updates the <c>output</c> child of every <c>//offer/data</c> node in
        /// <paramref name="xmlDoc"/> from that node's <c>ocrstring</c>,
        /// <c>rule</c> and optional <c>group</c> children.
        /// </summary>
        /// <param name="xmlDoc">The loaded document; it is modified in place.</param>
        /// <returns><c>true</c> if at least one <c>output</c> element was written.</returns>
        private static bool ProcessOffer(XmlDocument xmlDoc)
        {
            bool changed = false;

            foreach (XmlNode dataNode in xmlDoc.SelectNodes("//offer/data"))
            {
                // Declared per node so a missing child element can never pick up
                // the value left over from the previous node.
                string ocrString = "";
                string rule = "";
                int group = 0; // 0 means "use the whole match"

                foreach (XmlNode child in dataNode.ChildNodes)
                {
                    switch (child.Name)
                    {
                        case "ocrstring":
                            ocrString = child.InnerText;
                            break;
                        case "rule":
                            rule = child.InnerText;
                            break;
                        case "group":
                            // TryParse already stores the parsed value; fall back
                            // to the whole match when the text is not an integer.
                            if (!Int32.TryParse(child.InnerText, out group))
                            {
                                group = 0;
                            }
                            break;
                    }
                }

                // No rule given because OCR works effectively.
                if (rule.Length < 2)
                {
                    continue;
                }

                // An empty ocrstring is meant to trigger a search over the PDF text.
                // TODO: implement that fallback. Until then there is nothing to
                // match against, so the node is left untouched.
                if (String.IsNullOrWhiteSpace(ocrString))
                {
                    continue;
                }

                string output;
                try
                {
                    output = ExtractValue(ocrString, rule, group);
                }
                catch (ArgumentException ex)
                {
                    // The Regex constructor rejects malformed patterns; report the
                    // bad rule and keep processing the remaining nodes.
                    Console.Error.WriteLine("Skipping invalid rule '" + rule + "': " + ex.Message);
                    continue;
                }

                // Address the output element relative to the current node rather
                // than rebuilding an XPath from its first attribute: that lookup
                // failed for nodes without attributes, for attribute values
                // containing a quote, and for duplicate ids.
                XmlNode outputNode = dataNode.SelectSingleNode("output");
                if (outputNode == null)
                {
                    outputNode = dataNode.AppendChild(xmlDoc.CreateElement("output"));
                }

                outputNode.InnerText = String.IsNullOrEmpty(output) ? NoMatchPlaceholder : output;
                changed = true;
            }

            return changed;
        }

        /// <summary>
        /// Applies <paramref name="rule"/> to <paramref name="ocrString"/> and
        /// returns the requested part of the first match.
        /// </summary>
        /// <param name="ocrString">The text to search.</param>
        /// <param name="rule">The regular expression pattern.</param>
        /// <param name="group">
        /// Capture group number to return; values of 0 or less return the whole
        /// match with surrounding whitespace trimmed.
        /// </param>
        /// <returns>
        /// The selected text, or an empty string when the pattern or the
        /// requested group did not match.
        /// </returns>
        /// <exception cref="ArgumentException">
        /// Thrown by the <see cref="Regex"/> constructor when
        /// <paramref name="rule"/> is not a valid pattern.
        /// </exception>
        private static string ExtractValue(string ocrString, string rule, int group)
        {
            Match match = new Regex(rule).Match(ocrString);

            // A failed match, or a group that did not take part in the match,
            // reports an empty Value, so no extra success check is needed here.
            return group > 0
                ? match.Groups[group].Value
                : match.Value.Trim();
        }
    }
}
XML Data
<?xml version="1.0" encoding="utf-8"?>
<offer>
<data id="Salary">
<ocrstring>which is equal to $40,000.00 if working 40 hours per week</ocrstring>
<rule>.*(([+-]?\$[0-9]{1,3}(?:,?[0-9]{3})*\.[0-9]{2}))</rule>
<group>1</group>
<output></output>
</data>
<data id="DefaultWeeklyHours">
<ocrstring></ocrstring>
<rule><![CDATA["(?<=working).*?(?=hours)"]]></rule>
<output></output>
</data>
<data id="RelocationAttachment">
<ocrstring>LongWindingRoad222</ocrstring>
<rule>Regex2</rule>
<output></output>
</data>
</offer>