Home
Manage Your Code
Snippet: C# HTML Parser (C#)
Title: C# HTML Parser Language: C#
Description: Basic HTML Parsing. Handles HTML not in XML format. No explicit handling for empty tags, non-standards-compliant text, etc. Views: 188
Author: Richard Wattenbarger Date Added: 1/28/2008
Copy Code  
1//================================================================================

// ContentsRetrievedEventArgs.cs

3using System;
4using System.Collections.Generic;
5using System.Text;
6using System.Text.RegularExpressions;
7using net.wattenbarger.Utilities.Regex;
8
namespace net.wattenbarger.Utilities {

	/// <summary>
	/// Event data carrying a piece of text content. Used as the argument
	/// type of the <c>HtmlParser.ContentsRetrieved</c> event.
	/// </summary>
	public class ContentsRetrievedEventArgs : EventArgs {

		// Text handed to the constructor; never reassigned afterwards.
		private readonly string _text;

		/// <summary>
		/// Creates the event data for the given text.
		/// </summary>
		/// <param name="contents">Text to expose through
		/// <see cref="Contents"/>; stored as-is.</param>
		protected internal ContentsRetrievedEventArgs ( string contents ) {
			this._text = contents;
		}

		/// <summary>
		/// Gets the text supplied when this instance was constructed.
		/// </summary>
		public string Contents {
			get {
				return this._text;
			}
		}
	}
}
25
26//================================================================================

// HTMLParser.cs

28using System;
29using System.Collections.Generic;
30using System.IO;
31using System.Text;
32using System.Text.RegularExpressions;
33using net.wattenbarger.Utilities.Regex;
34
namespace net.wattenbarger.Utilities {

	/// <summary>
	/// Event-driven scanner for tagged (HTML-like) text. For every tag found
	/// by <c>TagRegex</c> it raises <see cref="TagProcessed"/>, followed by
	/// <see cref="ContentsRetrieved"/> when non-empty text sits between that
	/// tag and the next one. Accessed through <see cref="Instance"/>.
	/// </summary>
	/// <remarks>
	/// Text before the first tag and after the last tag is not reported.
	/// </remarks>
	public class HtmlParser {

		// Created by the type initializer, so two threads racing on
		// Instance can never observe two different parsers.
		private static readonly HtmlParser _instance = new HtmlParser ( );

		/// <summary>Raised once for each tag matched in the input.</summary>
		public event EventHandler<TagProcessedEventArgs> TagProcessed;

		/// <summary>
		/// Raised after a tag when trimmed, non-empty text lies between it
		/// and the following tag.
		/// </summary>
		public event EventHandler<ContentsRetrievedEventArgs>
			ContentsRetrieved;

		private HtmlParser ( ) { }

		/// <summary>Gets the single shared parser instance.</summary>
		public static HtmlParser Instance {
			get { return _instance; }
		}

		/// <summary>
		/// Rewrites every <c>AttributeRegex</c> match in
		/// <paramref name="fragment"/> as capture group 1 followed by capture
		/// group 2 wrapped in double quotes.
		/// </summary>
		/// <param name="fragment">Text to rewrite.</param>
		/// <returns>The rewritten text.</returns>
		/// <exception cref="ArgumentNullException">
		/// <paramref name="fragment"/> is null.</exception>
		public string ConvertAttributes ( string fragment ) {
			if ( fragment == null ) {
				throw new ArgumentNullException ( "fragment" );
			}
			AttributeRegex re = new AttributeRegex ( );
			return re.Replace ( fragment, "${1}\"${2}\"" );
		}

		/// <summary>
		/// Scans <paramref name="fragment"/> for tags, raising
		/// <see cref="TagProcessed"/> for each tag and then
		/// <see cref="ContentsRetrieved"/> for the trimmed text up to the
		/// next tag (skipped when that text is empty).
		/// </summary>
		/// <param name="fragment">Tagged text to scan.</param>
		/// <exception cref="ArgumentNullException">
		/// <paramref name="fragment"/> is null.</exception>
		public void ParseTaggedText ( string fragment ) {
			if ( fragment == null ) {
				throw new ArgumentNullException ( "fragment" );
			}

			TagRegex reTag = new TagRegex ( );

			// Walk the match chain once; each NextMatch result is reused as
			// the next iteration's current tag instead of being re-scanned.
			Match current = reTag.Match ( fragment );
			while ( current.Success ) {
				this.OnTagProcessed ( current );
				Match next = current.NextMatch ( );
				this.OnContentsRetrieved (
					GetContents ( fragment, current, next ) );
				current = next;
			}
		}

		/// <summary>
		/// Returns the trimmed text between the end of <paramref name="m"/>
		/// and the start of <paramref name="nextMatch"/>, or an empty string
		/// when there is no following tag.
		/// </summary>
		private static string GetContents ( string fragment, Match m,
			Match nextMatch ) {
			// NextMatch never returns null; the end of the chain is reported
			// through Success == false.
			if ( !nextMatch.Success ) {
				return "";
			}
			int startContents = m.Index + m.Length;
			int contentsLength = nextMatch.Index - startContents;
			return fragment.Substring ( startContents,
				contentsLength ).Trim ( );
		}

		/// <summary>Raises <see cref="ContentsRetrieved"/>.</summary>
		/// <param name="e">Event data passed to the handlers.</param>
		protected virtual void OnContentsRetrieved (
			ContentsRetrievedEventArgs e ) {
			// Copy to a local so an unsubscribe between the null check and
			// the invocation cannot cause a NullReferenceException.
			EventHandler<ContentsRetrievedEventArgs> handler
				= this.ContentsRetrieved;
			if ( handler != null ) {
				handler ( this, e );
			}
		}

		// Wraps non-empty text in event data and raises ContentsRetrieved;
		// empty text raises nothing.
		private void OnContentsRetrieved ( string contents ) {
			if ( contents.Length > 0 ) {
				this.OnContentsRetrieved (
					new ContentsRetrievedEventArgs ( contents ) );
			}
		}

		/// <summary>Raises <see cref="TagProcessed"/>.</summary>
		/// <param name="e">Event data passed to the handlers.</param>
		protected virtual void OnTagProcessed (
			TagProcessedEventArgs e ) {
			// Local copy for the same reason as in OnContentsRetrieved.
			EventHandler<TagProcessedEventArgs> handler
				= this.TagProcessed;
			if ( handler != null ) {
				handler ( this, e );
			}
		}

		// Wraps the tag match in event data and raises TagProcessed.
		private void OnTagProcessed ( Match m ) {
			this.OnTagProcessed ( new TagProcessedEventArgs ( m ) );
		}
	}
}
107
108
109//================================================================================

// TagProcessedEventArgs.cs

111using System;
112using System.Collections.Generic;
113using System.Text;
114using System.Text.RegularExpressions;
115using net.wattenbarger.Utilities.Regex;
116
namespace net.wattenbarger.Utilities {

	/// <summary>
	/// Event data describing a single tag, built from a regular-expression
	/// match whose capture group 1 is the tag text and capture group 2 is
	/// the tag name.
	/// </summary>
	public class TagProcessedEventArgs : EventArgs {

		private readonly string _tagName;
		private readonly string _tagText;
		private readonly bool _isEndTag;

		/// <summary>
		/// Builds the event data from a successful tag match.
		/// </summary>
		/// <param name="m">Match to read capture groups 1 and 2 from.</param>
		/// <exception cref="ArgumentNullException">
		/// <paramref name="m"/> is null.</exception>
		/// <exception cref="ApplicationException">
		/// <paramref name="m"/> is not a successful match.</exception>
		protected internal TagProcessedEventArgs ( Match m ) {
			if ( m == null ) {
				throw new ArgumentNullException ( "m" );
			}
			if ( !m.Success ) {
				// Exception type kept as-is so existing catch blocks
				// continue to work.
				throw new ApplicationException (
					"Invalid tag." );
			}

			_tagText = m.Groups [ 1 ].Value;
			_tagName = m.Groups [ 2 ].Value;
			// StartsWith is safe on an empty capture, unlike indexing [0].
			_isEndTag = _tagText.StartsWith ( "/",
				StringComparison.Ordinal );
		}

		/// <summary>
		/// Gets the value of capture group 2 of the match.
		/// </summary>
		public string TagName {
			get { return _tagName; }
		}

		/// <summary>
		/// Gets the value of capture group 1 of the match.
		/// </summary>
		public string TagText {
			get { return _tagText; }
		}

		/// <summary>
		/// Gets whether <see cref="TagText"/> begins with '/'.
		/// </summary>
		public bool IsEndTag {
			get { return _isEndTag; }
		}
	}
}
150
151//================================================================================

// RegularExpressionsDefinition.cs

// Build and run this file separately to generate the compiled
// regular-expressions library, then add a reference to that library in the
// build containing the other three classes above.

155#define DEBUG
156#define TRACE
157
158using System;
159using System.Collections.Generic;
160using System.Reflection;
161using System.Reflection.Emit;
162using System.Text;
163using System.Text.RegularExpressions;
164
165[assembly:AssemblyVersion ( "1.0.0.0" ) ]
166[assembly:AssemblyProduct ( "Regular Expressions for HTML Parser" ) ]
167
namespace net.wattenbarger.Utilities.RegularExpressions {

	/// <summary>
	/// Build-time tool that compiles the parser's regular expressions
	/// (AttributeRegex, TagRegex, NewLineRegex) into a separate assembly
	/// named "HtmlParser.RegularExpressions". The generated types are placed
	/// in the namespace "net.wattenbarger.Utilities.Regex".
	/// </summary>
	/// <remarks>
	/// Relies on <c>Regex.CompileToAssembly</c>. NOTE(review): that API is
	/// believed to be supported only on .NET Framework - confirm before
	/// targeting another runtime.
	/// </remarks>
	public class RegularExpressionsDefinition {

		// Regex type name -> pattern text.
		private readonly Dictionary<string, string> _patternDictionary;
		// Regex type name -> compilation settings built from the pattern.
		private readonly Dictionary<string, RegexCompilationInfo> _rciDictionary;

		/// <summary>Creates a definition with no patterns loaded.</summary>
		public RegularExpressionsDefinition ( ) {
			_patternDictionary
				= new Dictionary<string, string> ( );
			_rciDictionary
				= new Dictionary<string, RegexCompilationInfo> ( );
		}

		/// <summary>
		/// Registers the three patterns. Safe to call repeatedly: entries
		/// are assigned by key rather than added, so a second call
		/// overwrites instead of throwing on a duplicate key.
		/// </summary>
		public void LoadPatterns ( ) {
			_patternDictionary [ "AttributeRegex" ]
				= @"(\w+\=)((\w|/)+)";
			_patternDictionary [ "TagRegex" ]
				= "(?s:\\<((/?\\w+).*?)\\>)";
			_patternDictionary [ "NewLineRegex" ]
				= @"(\r\n)|\r|\n";
		}

		/// <summary>
		/// Loads the patterns and builds one public
		/// <see cref="RegexCompilationInfo"/> per pattern. Safe to call
		/// repeatedly for the same reason as <see cref="LoadPatterns"/>.
		/// </summary>
		public void LoadRegularExpressions ( ) {
			this.LoadPatterns ( );
			foreach ( KeyValuePair<string, string> kvp
				in _patternDictionary ) {
				RegexCompilationInfo rci
					= new RegexCompilationInfo ( kvp.Value,
						RegexOptions.Compiled,
						kvp.Key, "net.wattenbarger.Utilities.Regex", true );
				_rciDictionary [ kvp.Key ] = rci;
			}
		}

		// Builds the assembly-level attributes (version and product name)
		// stamped onto the generated assembly.
		private CustomAttributeBuilder [ ] CreateAttributes ( ) {

			List<CustomAttributeBuilder> attributeBuilders
				= new List<CustomAttributeBuilder> ( );

			Type [ ] stringCtorSignature
				= new Type [ ] { typeof ( string ) };

			attributeBuilders.Add ( new CustomAttributeBuilder (
				typeof ( AssemblyVersionAttribute ).GetConstructor (
				stringCtorSignature ),
				new object [ ] { "1.0.0.0" } ) );

			attributeBuilders.Add ( new CustomAttributeBuilder (
				typeof ( AssemblyProductAttribute ).GetConstructor (
				stringCtorSignature ),
				new object [ ] { "HtmlParser Regular Expressions" } ) );

			return attributeBuilders.ToArray ( );
		}

		/// <summary>
		/// Loads all regular expressions and compiles them into the
		/// "HtmlParser.RegularExpressions" assembly.
		/// </summary>
		public void CreateRegexAssembly ( ) {
			this.LoadRegularExpressions ( );
			RegexCompilationInfo [ ] compilationInfos
				= new RegexCompilationInfo [ _rciDictionary.Count ];
			_rciDictionary.Values.CopyTo ( compilationInfos, 0 );
			AssemblyName an
				= new AssemblyName ( "HtmlParser.RegularExpressions" );
			Regex.CompileToAssembly ( compilationInfos, an,
				this.CreateAttributes ( ) );
		}

		/// <summary>
		/// Entry point: generates the regular-expressions assembly.
		/// </summary>
		public static void Main ( ) {
			RegularExpressionsDefinition red
				= new RegularExpressionsDefinition ( );
			red.CreateRegexAssembly ( );
		}

	}
}
244
245