浅谈XML压缩算法

开发 算法
在用XML传输数据的时候,XML本身的元素名称、属性名称所占的空间可能比有效信息还要大。本文示例一种简单实用的XML压缩算法,主要思路是用整数来表示XML标签和属性名,以降低传输量。

XML压缩单元测试代码

  1. class Program {  
  2.    public static string XML = @"<?xml version=""1.0"" encoding=""utf-16""?> 
  3.     <Customer> 
  4. <CustomerID>ALFKI</CustomerID> 
  5. <PO>9572658</PO> 
  6. <Address AddressType=""work""> 
  7.     <Street>One Main Street</Street> 
  8.     <City>Anywhere</City> 
  9.     <State>NJ</State> 
  10.     <Zip>08080</Zip> 
  11. </Address> 
  12. <Order> 
  13.     <OrderID>10966</OrderID > 
  14.     <LineItem> 
  15.         <ProductID>37</ProductID> 
  16.         <UnitPrice>26.50 </UnitPrice> 
  17.         <Quantity>8</Quantity> 
  18.         <Description>Gravad lax </Description>               
  19.     </LineItem> 
  20.     <LineItem> 
  21.         <ProductID>56 </ProductID> 
  22.         <UnitPrice>38.00</UnitPrice> 
  23.         <Quantity>12</Quantity> 
  24.         <Description>Gnocchi di nonna Alice</Description>               
  25.     </LineItem> 
  26. </Order>      
  27. </Customer>";  
  /// <summary>
  /// Demo entry point. Prints the UTF-8 size of the sample document, its size
  /// after plain Deflate, its size after the dictionary-based XmlZip encoding,
  /// the size and text of the document restored from that encoding, and the
  /// size after applying Deflate on top of the XmlZip encoding.
  /// </summary>
  /// <param name="args">Command-line arguments (unused).</param>
  static void Main(string[] args) {
      XmlZip zip = new XmlZip();

      byte[] bs = Encoding.UTF8.GetBytes(XML);
      Console.WriteLine("原始文件长度:{0}", bs.Length);
      Console.WriteLine("Deflate压缩后长度: {0}", DeflatedLength(bs));

      // The same sample document is used both to build the name dictionary
      // and as the payload to encode.
      zip.Init(XML);
      bs = zip.XmlToBytes(XML);
      Console.WriteLine("XML压缩后长度:{0}", bs.Length);
      string str = zip.BytesToXml(bs);
      Console.WriteLine("还原后长度:{0}", Encoding.UTF8.GetByteCount(str));
      Console.WriteLine(str);

      Console.WriteLine("先XML压缩,再Deflate压缩后的长度:{0}", DeflatedLength(bs));
      Console.ReadKey();
  }

  /// <summary>
  /// Deflate-compresses <paramref name="data"/> into memory and returns the
  /// number of compressed bytes produced.
  /// </summary>
  /// <param name="data">Bytes to compress.</param>
  /// <returns>Length of the compressed output in bytes.</returns>
  private static long DeflatedLength(byte[] data) {
      using (MemoryStream ms = new MemoryStream()) {
          // leaveOpen = true: the DeflateStream must be disposed (which flushes
          // its final block) before the length is read, without closing ms.
          using (DeflateStream deflate = new DeflateStream(ms, CompressionMode.Compress, true)) {
              deflate.Write(data, 0, data.Length);
          }
          return ms.Length;
      }
  }

测试输出

原始文件长度:740

Deflate压缩后长度: 438

XML压缩后长度:295

还原后长度:727

  1. <?xml version="1.0" encoding="utf-16"?> 
  2. <Customer> 
  3.   <CustomerID>ALFKI</CustomerID> 
  4.   <PO>9572658</PO> 
  5.   <Address AddressType="work"> 
  6.     <Street>One Main Street</Street> 
  7.     <City>Anywhere</City> 
  8.     <State>NJ</State> 
  9.     <Zip>08080</Zip> 
  10.   </Address> 
  11.   <Order> 
  12.     <OrderID>10966</OrderID> 
  13.     <LineItem> 
  14.       <ProductID>37</ProductID> 
  15.       <UnitPrice>26.50 </UnitPrice> 
  16.       <Quantity>8</Quantity> 
  17.       <Description>Gravad lax </Description>               
  18.     </LineItem> 
  19.     <LineItem> 
  20.       <ProductID>56 </ProductID> 
  21.       <UnitPrice>38.00</UnitPrice> 
  22.       <Quantity>12</Quantity> 
  23.       <Description>Gnocchi di nonna Alice</Description>               
  24.     </LineItem> 
  25.   </Order> 
  26. </Customer> 

先XML压缩,再Deflate压缩后的长度:357

可以看到,XML压缩后的数据约为原始数据的40%(295/740字节),压缩率可能不如其它专用的压缩算法,但效果还算令人满意。而且该算法比较通用:只要通信双方知道XML的Schema,甚至只需双方各持有一份完整的示例文档,就可以进行压缩通信。目前只做了功能测试,没有做性能测试,大家可以先借鉴一下思路。

完整代码

大致原理是:通信双方各持有一份由XML文档的节点名称、属性名称构成的字典;发送方传输时用ushort代替原有的XML标签和属性名,接收方再通过字典把ushort转换回原始的元素名和属性名,这样就省去了大量重复的标签文本。

代码只做本文的示例,写的比较随意,没有什么防御性和健壮性。

  /// <summary>
  /// Kind of XML name that a dictionary entry stands for.
  /// </summary>
  internal enum ItemType {
      /// <summary>The entry is an element (tag) name.</summary>
      Element = 0,

      /// <summary>
      /// The entry is an attribute name. The member name is misspelled, but it
      /// is referenced under this spelling elsewhere in the file, so it is kept.
      /// </summary>
      Attritube = 1
  }
  /// <summary>
  /// One entry of the name dictionary: the path under which a name was seen,
  /// the name itself, and whether it is an element or an attribute.
  /// </summary>
  internal class XmlNodeItem {
      /// <summary>Whether this entry is an element or an attribute.</summary>
      public ItemType ItemType { get; set; }

      /// <summary>The element or attribute name.</summary>
      public string Text { get; set; }

      /// <summary>Path string that identifies where the name occurs.</summary>
      public string Xpath { get; set; }

      /// <summary>Returns <see cref="Xpath"/>.</summary>
      public override string ToString() {
          return this.Xpath;
      }
  }
  /// <summary>
  /// A mutable stack of path segments that renders as a simple XPath-like
  /// string, e.g. <c>/Customer/Address/@AddressType</c>.
  /// </summary>
  internal class MyXpath {
      // Segments in document order; each already carries its "/" or "/@" prefix.
      private readonly LinkedList<string> _node = new LinkedList<string>();

      /// <summary>Appends an element segment (<c>/name</c>).</summary>
      /// <param name="name">Element name.</param>
      public void AddElement(string name) {
          _node.AddLast("/" + name);
      }

      /// <summary>Appends an attribute segment (<c>/@name</c>).</summary>
      /// <param name="name">Attribute name.</param>
      public void AddAttribute(string name) {
          _node.AddLast("/@" + name);
      }

      /// <summary>
      /// Removes the most recently added segment, whether it was an element or
      /// an attribute.
      /// </summary>
      /// <exception cref="InvalidOperationException">The path is empty.</exception>
      public void RemoveLastElement() {
          _node.RemoveLast();
      }

      /// <summary>
      /// Concatenates all segments. An empty path yields an empty string
      /// (the previous implementation dereferenced the first node and failed
      /// when there was none).
      /// </summary>
      public override string ToString() {
          StringBuilder sb = new StringBuilder();
          foreach (string segment in _node) {
              sb.Append(segment);
          }
          return sb.ToString();
      }
  }
  34. class XmlZip {  
  35.     Dictionary<ushort, XmlNodeItem> _map = new Dictionary<ushort, XmlNodeItem>();  
  36.     Dictionary<string, ushort> _map2 = new Dictionary<string, ushort>();  
  37.     MyXpath _path = new MyXpath();  
  38.  
  /// <summary>
  /// Builds the name dictionary from a sample document. Every element and
  /// attribute occurrence, in document order, is assigned the next index
  /// starting at 1; the reverse map (path to index) is rebuilt afterwards, so
  /// a path that occurs several times resolves to one of its indices.
  /// Both communicating sides must call this with the same sample so that
  /// their indices agree.
  /// </summary>
  /// <param name="xmlInput">Sample XML document text.</param>
  /// <exception cref="ArgumentNullException"><paramref name="xmlInput"/> is null.</exception>
  /// <exception cref="InvalidOperationException">
  /// The sample contains more names than fit between the reserved markers 0 and 65535.
  /// </exception>
  public void Init(string xmlInput) {
      if (xmlInput == null) {
          throw new ArgumentNullException("xmlInput");
      }

      // A local path keeps this call independent of any state left behind by
      // an earlier call that failed half-way through a document.
      MyXpath path = new MyXpath();
      ushort nextIndex = 1;

      using (StringReader sr = new StringReader(xmlInput))
      using (XmlReader reader = XmlReader.Create(sr)) {
          while (reader.Read()) {
              switch (reader.NodeType) {
                  case XmlNodeType.Element:
                      path.AddElement(reader.Name);
                      RegisterName(path, reader.Name, ItemType.Element, ref nextIndex);

                      if (reader.MoveToFirstAttribute()) {
                          do {
                              path.AddAttribute(reader.Name);
                              RegisterName(path, reader.Name, ItemType.Attritube, ref nextIndex);
                              path.RemoveLastElement();
                          } while (reader.MoveToNextAttribute());
                          // Return to the element so IsEmptyElement below
                          // refers to the element, not the last attribute.
                          reader.MoveToElement();
                      }

                      // <x/> produces no EndElement node, so pop it here.
                      if (reader.IsEmptyElement) {
                          path.RemoveLastElement();
                      }
                      break;

                  case XmlNodeType.EndElement:
                      path.RemoveLastElement();
                      break;

                  default:
                      break;
              }
          }
      }

      foreach (KeyValuePair<ushort, XmlNodeItem> pair in _map) {
          _map2[pair.Value.Xpath] = pair.Key;
      }
  }

  /// <summary>
  /// Stores one dictionary entry for the current path under the next free index.
  /// </summary>
  private void RegisterName(MyXpath path, string name, ItemType type, ref ushort nextIndex) {
      // 0 marks text and ushort.MaxValue marks an end tag in the encoded
      // stream; letting the counter reach or wrap past them would make the
      // stream ambiguous.
      if (nextIndex == ushort.MaxValue) {
          throw new InvalidOperationException(
              "The sample document contains too many element/attribute names to index with a ushort.");
      }

      _map[nextIndex++] = new XmlNodeItem() {
          Xpath = path.ToString(),
          Text = name,
          ItemType = type
      };
  }
  86.  
  /// <summary>
  /// Encodes an XML document using the dictionary built by Init.
  /// The output is a sequence of ushort markers:
  /// a dictionary index for an element start or an attribute (an attribute
  /// index is followed by a ushort byte count and the UTF-8 value),
  /// 0 for a text node (followed by a ushort byte count and the UTF-8 text),
  /// and ushort.MaxValue for an element end.
  /// </summary>
  /// <param name="xmlInput">XML document text to encode.</param>
  /// <returns>The encoded bytes.</returns>
  /// <exception cref="ArgumentNullException"><paramref name="xmlInput"/> is null.</exception>
  /// <exception cref="InvalidOperationException">
  /// An element is not in the dictionary, or a text/attribute value is longer
  /// than 65535 UTF-8 bytes.
  /// </exception>
  public byte[] XmlToBytes(string xmlInput) {
      if (xmlInput == null) {
          throw new ArgumentNullException("xmlInput");
      }

      // Local path: encoding does not depend on, or disturb, shared state.
      MyXpath path = new MyXpath();

      using (MemoryStream ms = new MemoryStream())
      using (BinaryWriter bw = new BinaryWriter(ms))
      using (StringReader sr = new StringReader(xmlInput))
      using (XmlReader reader = XmlReader.Create(sr)) {
          while (reader.Read()) {
              ushort index;
              switch (reader.NodeType) {
                  case XmlNodeType.Element:
                      path.AddElement(reader.Name);
                      if (!_map2.TryGetValue(path.ToString(), out index)) {
                          // Omitting the start marker while still emitting the
                          // matching end marker would produce bytes that cannot
                          // be decoded, so fail here instead.
                          throw new InvalidOperationException(
                              "Element is not in the dictionary: " + path.ToString());
                      }
                      bw.Write(index);

                      if (reader.MoveToFirstAttribute()) {
                          do {
                              path.AddAttribute(reader.Name);
                              bool known = _map2.TryGetValue(path.ToString(), out index);
                              // Always pop the attribute segment, known or not;
                              // otherwise every later lookup uses a wrong path.
                              path.RemoveLastElement();
                              if (known) {
                                  bw.Write(index);
                                  WriteLengthPrefixedUtf8(bw, reader.Value);
                              }
                              // An attribute missing from the dictionary is
                              // dropped, as before; the stream stays decodable.
                          } while (reader.MoveToNextAttribute());
                          reader.MoveToElement();
                      }

                      // <x/> produces no EndElement node: close it right away.
                      if (reader.IsEmptyElement) {
                          path.RemoveLastElement();
                          bw.Write(ushort.MaxValue);
                      }
                      break;

                  case XmlNodeType.EndElement:
                      path.RemoveLastElement();
                      bw.Write(ushort.MaxValue);
                      break;

                  case XmlNodeType.Text:
                  case XmlNodeType.CDATA:
                      // CDATA carries character data too; it is encoded like text.
                      bw.Write((ushort)0);
                      WriteLengthPrefixedUtf8(bw, reader.Value);
                      break;

                  default:
                      break;
              }
          }

          bw.Flush();
          return ms.ToArray();
      }
  }

  /// <summary>
  /// Writes <paramref name="value"/> as a ushort byte count followed by its UTF-8 bytes.
  /// </summary>
  private static void WriteLengthPrefixedUtf8(BinaryWriter bw, string value) {
      byte[] bs = Encoding.UTF8.GetBytes(value);
      // The length field is a ushort; a plain cast would silently truncate it
      // and desynchronise the decoder.
      if (bs.Length > ushort.MaxValue) {
          throw new InvalidOperationException(
              "Value is too long to encode: " + bs.Length + " bytes (maximum " + ushort.MaxValue + ").");
      }
      bw.Write((ushort)bs.Length);
      bw.Write(bs);
  }
  147.  
  /// <summary>
  /// Decodes bytes produced by XmlToBytes back into indented XML text, using
  /// the dictionary built by Init. Markers are read as ushort values:
  /// a dictionary index starts an element or writes an attribute (the latter
  /// is followed by a ushort byte count and the UTF-8 value), 0 introduces a
  /// length-prefixed text node, and ushort.MaxValue ends the current element.
  /// </summary>
  /// <param name="bytes">Encoded document.</param>
  /// <returns>The restored XML text.</returns>
  /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
  /// <exception cref="EndOfStreamException">The input ends in the middle of a marker or value.</exception>
  /// <exception cref="InvalidDataException">A marker is neither reserved nor in the dictionary.</exception>
  public string BytesToXml(byte[] bytes) {
      if (bytes == null) {
          throw new ArgumentNullException("bytes");
      }

      StringBuilder sb = new StringBuilder();
      XmlWriterSettings settings = new XmlWriterSettings();
      settings.Indent = true;

      using (MemoryStream ms = new MemoryStream(bytes))
      using (BinaryReader br = new BinaryReader(ms))
      using (StringWriter sw = new StringWriter(sb))
      using (XmlWriter writer = XmlWriter.Create(sw, settings)) {
          // Test the stream position rather than BinaryReader.PeekChar():
          // PeekChar decodes the upcoming bytes as a character, which is not
          // meaningful for this binary format and can throw on some byte
          // sequences.
          while (ms.Position < ms.Length) {
              ushort readFlag = br.ReadUInt16();
              XmlNodeItem item;
              if (_map.TryGetValue(readFlag, out item)) {
                  if (item.ItemType == ItemType.Element) {
                      writer.WriteStartElement(item.Text);
                  }
                  else if (item.ItemType == ItemType.Attritube) {
                      writer.WriteAttributeString(item.Text, ReadLengthPrefixedUtf8(br));
                  }
              }
              else if (readFlag == 0) {
                  writer.WriteString(ReadLengthPrefixedUtf8(br));
              }
              else if (readFlag == ushort.MaxValue) {
                  writer.WriteEndElement();
              }
              else {
                  // Skipping an unknown marker would misread any payload that
                  // follows it as further markers.
                  throw new InvalidDataException(
                      "Unknown marker " + readFlag + " at offset " + (ms.Position - 2) + ".");
              }
          }
          writer.Flush();
      }

      return sb.ToString();
  }

  /// <summary>
  /// Reads a ushort byte count followed by that many UTF-8 bytes and returns the decoded string.
  /// </summary>
  private static string ReadLengthPrefixedUtf8(BinaryReader br) {
      int len = br.ReadUInt16();
      byte[] bs = br.ReadBytes(len);
      // ReadBytes returns fewer bytes at end of stream instead of throwing.
      if (bs.Length != len) {
          throw new EndOfStreamException(
              "Expected " + len + " bytes of text but only " + bs.Length + " were available.");
      }
      return Encoding.UTF8.GetString(bs);
  }

【编辑推荐】

  1. Servlet引擎的安装
  2. 配置Servlet开发环境
  3. 标签库中JSP Servlet调用
  4. 学习Java Servlet时遇到的小问题
  5. Servlet在session中共享链接
责任编辑:彭凡 来源: JavaEye
相关推荐

2021-09-04 16:12:33

压缩算法数据

2009-09-08 16:55:01

Linq实现XML转换

2009-09-14 15:45:28

LINQ删除XML节点

2009-09-29 15:52:26

Hibernate X

2009-06-22 11:52:00

javascriptxml

2009-08-18 17:08:50

C#编写XML文档

2009-08-24 17:24:28

C#创建XML文档

2023-10-16 19:05:20

2023-01-15 17:57:01

2011-05-18 16:02:08

XML

2009-07-28 17:34:28

ASP.NET XML

2009-05-04 10:25:36

XML.NET数据

2009-06-22 10:22:57

SQL Server

2019-04-16 11:02:10

TCPIPLinux

2014-12-17 11:19:09

H.264

2009-04-14 09:24:40

OracleXML导出

2020-02-13 17:27:31

CAPPaxos 共识算法

2011-06-07 17:14:15

关系型数据库压缩技术

2012-09-28 14:08:20

大型网站架构大型网站算法算法

2017-03-20 10:14:03

语音识别匹配算法模型
点赞
收藏

51CTO技术栈公众号