EDN Admin
Well-known member
I have to extract specific elements from an XML document, compress them into a string, and replace the original elements with the compressed string.
Here's an example of the document:
<pre class="prettyprint"><?xml version="1.0" encoding="UTF-8"?>
<GovTalkMessage xmlns="http://www.govtalk.gov.uk/CM/envelope">
  <EnvelopeVersion>2.0</EnvelopeVersion>
  <Body>
    <IRenvelope xmlns="http://www.govtalk.gov.uk/taxation/EOY/11-12/1">
      <EndOfYearReturn>
        <ReturnType>original</ReturnType>
        <P14>
          <Name>
            <Fore>Abc</Fore>
            <Sur>Def</Sur>
          </Name>
        </P14>
        <P14>
          <Name>
            <Fore>Ghi</Fore>
            <Sur>Jkl</Sur>
          </Name>
        </P14>
      </EndOfYearReturn>
    </IRenvelope>
  </Body>
</GovTalkMessage>
</pre>
The problem I'm having is that when I iterate through the P14 elements and call OuterXml on each node to extract its text, the namespace declaration is added to the string. I can get rid of it using a regex, but is there a way of just extracting the
text without having to resort to regex? Here's the console app test code I'm currently using:
<pre class="prettyprint lang-vb">Imports System.IO
Imports System.IO.Compression
Imports System.Xml
Imports System.Text
Imports System.Text.RegularExpressions
Module module1
    ''' <summary>
    ''' Console test harness. Loads p14.xml, concatenates the OuterXml of every P14
    ''' child of EndOfYearReturn, removes those P14 elements, and inserts a
    ''' CompressedPart element (Type="gzip") after ReturnType whose text is the
    ''' gzipped, base64-encoded P14 markup. The resulting document is then written,
    ''' indented, to the console.
    ''' </summary>
    Sub main()
        Dim xDoc As New XmlDocument
        Dim xNode As XmlNode = Nothing
        Dim GovTalkMessageNS As String = String.Empty
        Dim IRenvelopeNS As String = String.Empty
        Dim NewAttr As XmlAttribute
        Dim eoyElement As XmlElement
        Dim SB As New System.Text.StringBuilder
        Dim nsmgr As XmlNamespaceManager
        Try
            ' load the xml file into an XmlDocument
            xDoc.Load("p14.xml")

            ' find the namespace of the GovTalkMessage element
            ' (NamespaceURI is used instead of Attributes("xmlns") so a missing
            ' attribute yields an empty string rather than a null dereference)
            For Each xNode In xDoc.ChildNodes
                If xNode.Name = "GovTalkMessage" Then
                    GovTalkMessageNS = xNode.NamespaceURI
                    Exit For
                End If
            Next
            If GovTalkMessageNS = String.Empty Then
                Throw New XmlException("GovTalkMessage is missing")
            End If

            ' Create a namespace manager for the GovTalkMessage namespace
            nsmgr = New XmlNamespaceManager(xDoc.NameTable)
            nsmgr.AddNamespace("gt", GovTalkMessageNS)

            ' Find the first element child of the Body element (always IRenvelope).
            ' It cannot be found via //gt:Body/gt:IRenvelope because IRenvelope
            ' declares its own default namespace, so it is not in the gt namespace.
            Dim bodyNode As XmlNode = xDoc.SelectSingleNode("//gt:Body", nsmgr)
            If bodyNode Is Nothing Then
                Throw New XmlException("Body is missing")
            End If
            xNode = Nothing
            For Each candidate As XmlNode In bodyNode.ChildNodes
                ' skip comments / processing instructions that may precede the envelope
                If candidate.NodeType = XmlNodeType.Element Then
                    xNode = candidate
                    Exit For
                End If
            Next
            If xNode Is Nothing Then
                Throw New XmlException("IRenvelope is missing")
            End If
            IRenvelopeNS = xNode.NamespaceURI
            nsmgr.AddNamespace("ir", IRenvelopeNS)

            ' see if there's an EndOfYearReturn section.. if so, compress the P14 elements
            xNode = xDoc.SelectSingleNode("//ir:EndOfYearReturn", nsmgr)
            If xNode IsNot Nothing Then
                ' Snapshot the P14 children first: removing nodes while enumerating
                ' the live ChildNodes / SelectNodes list is not safe.
                Dim p14Nodes As New System.Collections.Generic.List(Of XmlNode)
                For Each x14Node As XmlNode In xNode.ChildNodes
                    If x14Node.Name = "P14" Then p14Nodes.Add(x14Node)
                Next

                For Each x14Node As XmlNode In p14Nodes
                    ' OuterXml serialises the node as a stand-alone fragment, so it
                    ' re-declares the default namespace that P14 inherits from
                    ' IRenvelope; that is why xmlns shows up here even though the
                    ' P14 start tag in the file carries no xmlns attribute.
                    SB.Append(x14Node.OuterXml)
                    ' remove the old P14 node now that its text has been captured
                    xNode.RemoveChild(x14Node)
                Next
                Console.WriteLine(SB.ToString)

                If SB.Length > 0 Then
                    ' create new element to hold the compressed P14s
                    eoyElement = xDoc.CreateElement("CompressedPart", IRenvelopeNS)
                    NewAttr = xDoc.CreateAttribute("Type")
                    NewAttr.Value = "gzip"
                    eoyElement.Attributes.Append(NewAttr)
                    ' get rid of the xmlns namespace from the string,
                    ' compress and encode to base64 and store in new element
                    eoyElement.InnerText = gZip(Regex.Replace(SB.ToString, " xmlns=""([^""]*)""", ""))
                    ' look up ReturnType relative to EndOfYearReturn: InsertAfter
                    ' requires the reference node to be a child of xNode
                    xNode.InsertAfter(eoyElement, xNode.SelectSingleNode("ir:ReturnType", nsmgr))
                End If
            End If
        Catch ex As Exception
            Console.WriteLine("ERROR:-")
            Console.WriteLine(ex.Message)
        End Try

        ' display indented xml
        Console.WriteLine("")
        Dim builder = New StringBuilder()
        Dim settings = New XmlWriterSettings() With {.Indent = True}
        Using writer = XmlWriter.Create(builder, settings)
            xDoc.WriteTo(writer)
        End Using
        Console.WriteLine(builder.ToString())
        Console.WriteLine(Environment.NewLine + "Press return")
        Console.ReadLine()
    End Sub

    ''' <summary>
    ''' Encodes <paramref name="Value"/> as UTF-8, gzip-compresses the bytes and
    ''' returns the compressed data as a base64 string.
    ''' </summary>
    ''' <param name="Value">Text to compress.</param>
    ''' <returns>Base64 representation of the gzip stream.</returns>
    Public Function gZip(ByVal Value As String) As String
        Dim uncompressedBytes As Byte() = Encoding.UTF8.GetBytes(Value)
        Dim compressedBytes As Byte() = Nothing
        Using ms As New MemoryStream()
            ' the GZipStream must be closed before reading the buffer so that
            ' the final block and trailer are flushed into the MemoryStream
            Using gs As New GZipStream(ms, CompressionMode.Compress)
                gs.Write(uncompressedBytes, 0, uncompressedBytes.Length)
            End Using
            compressedBytes = ms.ToArray()
        End Using
        Return Convert.ToBase64String(compressedBytes)
    End Function
End Module
</pre>
<br/>
View the full article
Here's an example of the document:
<pre class="prettyprint"><?xml version="1.0" encoding="UTF-8"?>
<GovTalkMessage xmlns="http://www.govtalk.gov.uk/CM/envelope">
  <EnvelopeVersion>2.0</EnvelopeVersion>
  <Body>
    <IRenvelope xmlns="http://www.govtalk.gov.uk/taxation/EOY/11-12/1">
      <EndOfYearReturn>
        <ReturnType>original</ReturnType>
        <P14>
          <Name>
            <Fore>Abc</Fore>
            <Sur>Def</Sur>
          </Name>
        </P14>
        <P14>
          <Name>
            <Fore>Ghi</Fore>
            <Sur>Jkl</Sur>
          </Name>
        </P14>
      </EndOfYearReturn>
    </IRenvelope>
  </Body>
</GovTalkMessage>
</pre>
The problem I'm having is that when I iterate through the P14 elements and call OuterXml on each node to extract its text, the namespace declaration is added to the string. I can get rid of it using a regex, but is there a way of just extracting the
text without having to resort to regex? Here's the console app test code I'm currently using:
<pre class="prettyprint lang-vb">Imports System.IO
Imports System.IO.Compression
Imports System.Xml
Imports System.Text
Imports System.Text.RegularExpressions
Module module1
    ''' <summary>
    ''' Console test harness. Loads p14.xml, concatenates the OuterXml of every P14
    ''' child of EndOfYearReturn, removes those P14 elements, and inserts a
    ''' CompressedPart element (Type="gzip") after ReturnType whose text is the
    ''' gzipped, base64-encoded P14 markup. The resulting document is then written,
    ''' indented, to the console.
    ''' </summary>
    Sub main()
        Dim xDoc As New XmlDocument
        Dim xNode As XmlNode = Nothing
        Dim GovTalkMessageNS As String = String.Empty
        Dim IRenvelopeNS As String = String.Empty
        Dim NewAttr As XmlAttribute
        Dim eoyElement As XmlElement
        Dim SB As New System.Text.StringBuilder
        Dim nsmgr As XmlNamespaceManager
        Try
            ' load the xml file into an XmlDocument
            xDoc.Load("p14.xml")

            ' find the namespace of the GovTalkMessage element
            ' (NamespaceURI is used instead of Attributes("xmlns") so a missing
            ' attribute yields an empty string rather than a null dereference)
            For Each xNode In xDoc.ChildNodes
                If xNode.Name = "GovTalkMessage" Then
                    GovTalkMessageNS = xNode.NamespaceURI
                    Exit For
                End If
            Next
            If GovTalkMessageNS = String.Empty Then
                Throw New XmlException("GovTalkMessage is missing")
            End If

            ' Create a namespace manager for the GovTalkMessage namespace
            nsmgr = New XmlNamespaceManager(xDoc.NameTable)
            nsmgr.AddNamespace("gt", GovTalkMessageNS)

            ' Find the first element child of the Body element (always IRenvelope).
            ' It cannot be found via //gt:Body/gt:IRenvelope because IRenvelope
            ' declares its own default namespace, so it is not in the gt namespace.
            Dim bodyNode As XmlNode = xDoc.SelectSingleNode("//gt:Body", nsmgr)
            If bodyNode Is Nothing Then
                Throw New XmlException("Body is missing")
            End If
            xNode = Nothing
            For Each candidate As XmlNode In bodyNode.ChildNodes
                ' skip comments / processing instructions that may precede the envelope
                If candidate.NodeType = XmlNodeType.Element Then
                    xNode = candidate
                    Exit For
                End If
            Next
            If xNode Is Nothing Then
                Throw New XmlException("IRenvelope is missing")
            End If
            IRenvelopeNS = xNode.NamespaceURI
            nsmgr.AddNamespace("ir", IRenvelopeNS)

            ' see if there's an EndOfYearReturn section.. if so, compress the P14 elements
            xNode = xDoc.SelectSingleNode("//ir:EndOfYearReturn", nsmgr)
            If xNode IsNot Nothing Then
                ' Snapshot the P14 children first: removing nodes while enumerating
                ' the live ChildNodes / SelectNodes list is not safe.
                Dim p14Nodes As New System.Collections.Generic.List(Of XmlNode)
                For Each x14Node As XmlNode In xNode.ChildNodes
                    If x14Node.Name = "P14" Then p14Nodes.Add(x14Node)
                Next

                For Each x14Node As XmlNode In p14Nodes
                    ' OuterXml serialises the node as a stand-alone fragment, so it
                    ' re-declares the default namespace that P14 inherits from
                    ' IRenvelope; that is why xmlns shows up here even though the
                    ' P14 start tag in the file carries no xmlns attribute.
                    SB.Append(x14Node.OuterXml)
                    ' remove the old P14 node now that its text has been captured
                    xNode.RemoveChild(x14Node)
                Next
                Console.WriteLine(SB.ToString)

                If SB.Length > 0 Then
                    ' create new element to hold the compressed P14s
                    eoyElement = xDoc.CreateElement("CompressedPart", IRenvelopeNS)
                    NewAttr = xDoc.CreateAttribute("Type")
                    NewAttr.Value = "gzip"
                    eoyElement.Attributes.Append(NewAttr)
                    ' get rid of the xmlns namespace from the string,
                    ' compress and encode to base64 and store in new element
                    eoyElement.InnerText = gZip(Regex.Replace(SB.ToString, " xmlns=""([^""]*)""", ""))
                    ' look up ReturnType relative to EndOfYearReturn: InsertAfter
                    ' requires the reference node to be a child of xNode
                    xNode.InsertAfter(eoyElement, xNode.SelectSingleNode("ir:ReturnType", nsmgr))
                End If
            End If
        Catch ex As Exception
            Console.WriteLine("ERROR:-")
            Console.WriteLine(ex.Message)
        End Try

        ' display indented xml
        Console.WriteLine("")
        Dim builder = New StringBuilder()
        Dim settings = New XmlWriterSettings() With {.Indent = True}
        Using writer = XmlWriter.Create(builder, settings)
            xDoc.WriteTo(writer)
        End Using
        Console.WriteLine(builder.ToString())
        Console.WriteLine(Environment.NewLine + "Press return")
        Console.ReadLine()
    End Sub

    ''' <summary>
    ''' Encodes <paramref name="Value"/> as UTF-8, gzip-compresses the bytes and
    ''' returns the compressed data as a base64 string.
    ''' </summary>
    ''' <param name="Value">Text to compress.</param>
    ''' <returns>Base64 representation of the gzip stream.</returns>
    Public Function gZip(ByVal Value As String) As String
        Dim uncompressedBytes As Byte() = Encoding.UTF8.GetBytes(Value)
        Dim compressedBytes As Byte() = Nothing
        Using ms As New MemoryStream()
            ' the GZipStream must be closed before reading the buffer so that
            ' the final block and trailer are flushed into the MemoryStream
            Using gs As New GZipStream(ms, CompressionMode.Compress)
                gs.Write(uncompressedBytes, 0, uncompressedBytes.Length)
            End Using
            compressedBytes = ms.ToArray()
        End Using
        Return Convert.ToBase64String(compressedBytes)
    End Function
End Module
</pre>
<br/>
View the full article