1//================================================================================
2//ContentsRetrievedEventArgs.cs
3using System;
4using System.Collections.Generic;
5using System.Text;
6using System.Text.RegularExpressions;
7using net.wattenbarger.Utilities.Regex;
8
9namespace net.wattenbarger.Utilities {
10
/// <summary>
/// Event data that carries a piece of retrieved text to event subscribers.
/// </summary>
public class ContentsRetrievedEventArgs : EventArgs {

    // Assigned once in the constructor and never changed afterwards.
    private readonly string _contents;

    /// <summary>
    /// Creates the event data for the given text.
    /// </summary>
    /// <param name="contents">
    /// The retrieved text. It is stored as given; no validation is performed.
    /// </param>
    protected internal ContentsRetrievedEventArgs ( string contents ) {
        _contents = contents;
    }

    /// <summary>
    /// Gets the text that was supplied to the constructor.
    /// </summary>
    public string Contents {
        get { return _contents; }
    }

}
24}
25
26//================================================================================
27//HTMLParser.cs
28using System;
29using System.Collections.Generic;
30using System.IO;
31using System.Text;
32using System.Text.RegularExpressions;
33using net.wattenbarger.Utilities.Regex;
34
35namespace net.wattenbarger.Utilities {
36
/// <summary>
/// Singleton that scans a text fragment for tags (as defined by <c>TagRegex</c>)
/// and reports, through events, each tag found and the trimmed text located
/// between consecutive tags.
/// </summary>
public class HtmlParser {

    // Created by the static initializer so that every caller observes the same
    // instance. A check-then-create getter can hand out two instances when two
    // threads race, and handlers subscribed on one would never fire on the other.
    private static readonly HtmlParser _instance = new HtmlParser ( );

    /// <summary>Raised once for every tag matched in the parsed fragment.</summary>
    public event EventHandler<TagProcessedEventArgs> TagProcessed;

    /// <summary>
    /// Raised with the non-empty, trimmed text found between a tag and the tag
    /// that follows it.
    /// </summary>
    public event EventHandler<ContentsRetrievedEventArgs>
        ContentsRetrieved;

    private HtmlParser ( ) { }

    /// <summary>Gets the single shared parser instance.</summary>
    public static HtmlParser Instance {
        get { return _instance; }
    }

    /// <summary>
    /// Rewrites every match of <c>AttributeRegex</c> in <paramref name="fragment"/>
    /// as capture group 1 followed by capture group 2 wrapped in double quotes.
    /// </summary>
    /// <param name="fragment">The text to rewrite.</param>
    /// <returns>The rewritten text.</returns>
    public string ConvertAttributes ( string fragment ) {
        AttributeRegex re = new AttributeRegex ( );
        return re.Replace ( fragment, "${1}\"${2}\"" );
    }

    /// <summary>
    /// Walks every tag in <paramref name="fragment"/> in order. For each tag,
    /// <see cref="TagProcessed"/> is raised first, then
    /// <see cref="ContentsRetrieved"/> is raised if there is non-blank text
    /// between this tag and the next one. Text before the first tag and after
    /// the last tag is not reported.
    /// </summary>
    /// <param name="fragment">The tagged text to scan.</param>
    public void ParseTaggedText ( string fragment ) {

        TagRegex reTag = new TagRegex ( );

        // Step through the matches with NextMatch so that the look-ahead needed
        // for the contents is the same match used by the next iteration; each
        // tag is therefore scanned only once.
        Match current = reTag.Match ( fragment );
        while ( current.Success ) {
            this.OnTagProcessed ( current );
            Match next = current.NextMatch ( );
            this.OnContentsRetrieved (
                GetContents ( fragment, current, next ) );
            current = next;
        }
    }

    // Returns the trimmed text between the end of 'current' and the start of
    // 'next', or "" when 'current' is the last tag in the fragment.
    private static string GetContents ( string fragment, Match current,
        Match next ) {
        // NextMatch never returns null; the end of the matches is signalled by
        // a Match whose Success is false.
        if ( !next.Success ) {
            return "";
        }
        int startContents = current.Index + current.Length;
        int contentsLength = next.Index - startContents;
        return fragment.Substring ( startContents,
            contentsLength ).Trim ( );
    }

    /// <summary>Raises <see cref="ContentsRetrieved"/>.</summary>
    /// <param name="e">The event data handed to subscribers.</param>
    protected virtual void OnContentsRetrieved (
        ContentsRetrievedEventArgs e ) {
        // Copy the delegate first: a subscriber removed between the null check
        // and the call would otherwise cause a NullReferenceException.
        EventHandler<ContentsRetrievedEventArgs> handler
            = this.ContentsRetrieved;
        if ( handler != null ) {
            handler ( this, e );
        }
    }

    // Raises ContentsRetrieved for the given text; empty text is not reported.
    private void OnContentsRetrieved ( string contents ) {
        if ( contents.Length > 0 ) {
            this.OnContentsRetrieved (
                new ContentsRetrievedEventArgs ( contents ) );
        }
    }

    /// <summary>Raises <see cref="TagProcessed"/>.</summary>
    /// <param name="e">The event data handed to subscribers.</param>
    protected virtual void OnTagProcessed (
        TagProcessedEventArgs e ) {
        // Same copy-then-invoke pattern as OnContentsRetrieved.
        EventHandler<TagProcessedEventArgs> handler = this.TagProcessed;
        if ( handler != null ) {
            handler ( this, e );
        }
    }

    // Wraps the match in event data and raises TagProcessed.
    private void OnTagProcessed ( Match m ) {
        this.OnTagProcessed ( new TagProcessedEventArgs ( m ) );
    }
}
106}
107
108
109//================================================================================
110//TagProcessedEventArgs.cs
111using System;
112using System.Collections.Generic;
113using System.Text;
114using System.Text.RegularExpressions;
115using net.wattenbarger.Utilities.Regex;
116
117namespace net.wattenbarger.Utilities {
118
119
/// <summary>
/// Event data describing one tag, built from a successful regular-expression
/// match whose capture group 1 is the tag text and capture group 2 the tag name.
/// </summary>
public class TagProcessedEventArgs : EventArgs {

    // All three values are computed once in the constructor.
    private readonly string _tagName;
    private readonly string _tagText;
    private readonly bool _isEndTag;

    /// <summary>
    /// Extracts the tag text and tag name from <paramref name="m"/>.
    /// </summary>
    /// <param name="m">The match that located the tag.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="m"/> is null.
    /// </exception>
    /// <exception cref="ApplicationException">
    /// <paramref name="m"/> is not a successful match.
    /// </exception>
    internal protected TagProcessedEventArgs ( Match m ) {
        if ( m == null ) {
            throw new ArgumentNullException ( "m" );
        }
        if ( !m.Success ) {
            throw new ApplicationException (
                "Invalid tag." );
        }

        // Read the captures directly. A group that does not exist yields an
        // empty string, whereas Result("$n") would echo the literal "$n".
        _tagText = m.Groups [ 1 ].Value;
        _tagName = m.Groups [ 2 ].Value;

        // Check the length first: an empty capture has no first character.
        _isEndTag = _tagText.Length > 0 && _tagText [ 0 ] == '/';
    }

    /// <summary>
    /// Gets the value of capture group 2. The value is returned exactly as
    /// captured, so a leading '/' is kept if the pattern captured one.
    /// </summary>
    public string TagName {
        get { return _tagName; }
    }

    /// <summary>
    /// Gets the value of capture group 1.
    /// </summary>
    public string TagText {
        get { return _tagText; }
    }

    /// <summary>
    /// Gets whether <see cref="TagText"/> begins with '/'.
    /// </summary>
    public bool IsEndTag {
        get { return _isEndTag; }
    }
}
149}
150
151//================================================================================
152//RegularExpressionsDefinition.cs
//Compile and run this file on its own (it has its own Main) to generate the
//HtmlParser.RegularExpressions assembly, then add a reference to that assembly
//to the build containing the other three classes above.
155#define DEBUG
156#define TRACE
157
158using System;
159using System.Collections.Generic;
160using System.Reflection;
161using System.Reflection.Emit;
162using System.Text;
163using System.Text.RegularExpressions;
164
165[assembly:AssemblyVersion ( "1.0.0.0" ) ]
166[assembly:AssemblyProduct ( "Regular Expressions for HTML Parser" ) ]
167
168namespace net.wattenbarger.Utilities.RegularExpressions {
169
/// <summary>
/// Defines the regular-expression patterns used for tag parsing and compiles
/// them into a separate assembly through <c>Regex.CompileToAssembly</c>.
/// </summary>
public class RegularExpressionsDefinition {

    // Namespace assigned to every generated regex type.
    private const string RegexNamespace = "net.wattenbarger.Utilities.Regex";

    // Name of the assembly produced by CreateRegexAssembly.
    private const string RegexAssemblyName = "HtmlParser.RegularExpressions";

    // Generated type name -> pattern text.
    private readonly Dictionary<string, string> _patternDictionary;

    // Generated type name -> compilation settings for that pattern.
    private readonly Dictionary<string, RegexCompilationInfo> _rciDictionary;

    /// <summary>Creates a definition with no patterns loaded yet.</summary>
    public RegularExpressionsDefinition ( ) {
        _patternDictionary
            = new Dictionary<string, string> ( );
        _rciDictionary
            = new Dictionary<string, RegexCompilationInfo> ( );
    }

    /// <summary>
    /// Registers the three built-in patterns under the names of the types to
    /// generate. Entries are assigned rather than added, so calling this more
    /// than once is harmless (Dictionary.Add would throw on the second call).
    /// </summary>
    public void LoadPatterns ( ) {
        _patternDictionary [ "AttributeRegex" ]
            = @"(\w+\=)((\w|/)+)";
        _patternDictionary [ "TagRegex" ]
            = "(?s:\\<((/?\\w+).*?)\\>)";
        _patternDictionary [ "NewLineRegex" ]
            = @"(\r\n)|\r|\n";
    }

    /// <summary>
    /// Loads the patterns and builds one public, compiled
    /// <see cref="RegexCompilationInfo"/> per pattern. Safe to call repeatedly;
    /// an existing entry is replaced.
    /// </summary>
    public void LoadRegularExpressions ( ) {
        this.LoadPatterns ( );
        foreach ( KeyValuePair<string, string> kvp
            in _patternDictionary ) {
            RegexCompilationInfo rci
                = new RegexCompilationInfo ( kvp.Value,
                    RegexOptions.Compiled,
                    kvp.Key, RegexNamespace, true );
            _rciDictionary [ kvp.Key ] = rci;
        }
    }

    // Builds an attribute of the given type through its single-string constructor.
    private static CustomAttributeBuilder CreateStringAttribute (
        Type attributeType, string value ) {
        return new CustomAttributeBuilder (
            attributeType.GetConstructor (
                new Type [ ] { typeof ( string ) } ),
            new object [ ] { value } );
    }

    // Version and product attributes stamped onto the generated assembly.
    private CustomAttributeBuilder [ ] CreateAttributes ( ) {
        return new CustomAttributeBuilder [ ] {
            CreateStringAttribute (
                typeof ( AssemblyVersionAttribute ), "1.0.0.0" ),
            CreateStringAttribute (
                typeof ( AssemblyProductAttribute ),
                "HtmlParser Regular Expressions" )
        };
    }

    /// <summary>
    /// Loads all regular expressions and compiles them into the
    /// "HtmlParser.RegularExpressions" assembly.
    /// </summary>
    public void CreateRegexAssembly ( ) {
        this.LoadRegularExpressions ( );
        RegexCompilationInfo [ ] compilationInfos
            = new RegexCompilationInfo [ _rciDictionary.Count ];
        _rciDictionary.Values.CopyTo ( compilationInfos, 0 );
        AssemblyName an
            = new AssemblyName ( RegexAssemblyName );
        Regex.CompileToAssembly ( compilationInfos, an,
            this.CreateAttributes ( ) );
    }

    /// <summary>
    /// Entry point: generates the regular-expression assembly.
    /// </summary>
    public static void Main ( ) {
        RegularExpressionsDefinition red
            = new RegularExpressionsDefinition ( );
        red.CreateRegexAssembly ( );
    }

}
243}
244
245