1 //####COPYRIGHTBEGIN####
3 // ----------------------------------------------------------------------------
4 // Copyright (C) 1998, 1999, 2000 Red Hat, Inc.
6 // This program is part of the eCos host tools.
8 // This program is free software; you can redistribute it and/or modify it
9 // under the terms of the GNU General Public License as published by the Free
10 // Software Foundation; either version 2 of the License, or (at your option)
13 // This program is distributed in the hope that it will be useful, but WITHOUT
14 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 // You should have received a copy of the GNU General Public License along with
19 // this program; if not, write to the Free Software Foundation, Inc.,
20 // 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
22 // ----------------------------------------------------------------------------
24 //####COPYRIGHTEND####
25 //===========================================================================
26 //#####DESCRIPTIONBEGIN####
29 // Contact(s): julians
33 // Description: HTML parser/HTML Help file generator
40 //####DESCRIPTIONEND####
42 //===========================================================================
44 #ifndef _EC_HTMLPARSER_H_
45 #define _EC_HTMLPARSER_H_
48 #pragma interface "htmlparser.cpp"
51 #include "wx/module.h"
52 #include "wx/stream.h"
56 So how are we going to represent it? Compare with my Latex parser.
57 This generates a hierarchy because it respects the hierarchical nature of the Latex
58 commands. However, we don't _have_ to do that, we can make it linear, e.g.
60 tag-with-attributes text-chunk end-tag-with-attributes tag-with-attributes text-chunk
62 Otherwise, we need knowledge about HTML tags to parse hierarchically. This wouldn't be hard.
63 We would need to specify which tags have open/close parts, which don't, and for which the close part is optional.
70 * wxSimpleHtmlAttribute
71 * Representation of an attribute
// A single name/value attribute. Each attribute carries a pointer to the
// next attribute (m_next), so attributes can be chained into a singly linked
// list. wxSimpleHtmlTag is declared a friend and so can reach the members
// directly.
74 class wxSimpleHtmlAttribute
76 friend class wxSimpleHtmlTag;
// Stores the given name and value. A new attribute has no successor:
// m_next starts out as NULL.
78 wxSimpleHtmlAttribute(const wxString& name, const wxString& value)
80 m_name = name; m_value = value; m_next = NULL;
84 // Write this attribute to the given output stream (only declared here)
85 void Write(wxOutputStream& stream);
// Read-only access to the stored name and value.
88 const wxString& GetName() const { return m_name; }
89 const wxString& GetValue() const { return m_value; }
// Link to the following attribute in the chain; NULL until one is set.
// NOTE(review): GetNextAttribute is not const-qualified, unlike the other getters.
91 wxSimpleHtmlAttribute* GetNextAttribute() { return m_next; }
92 void SetNextAttribute(wxSimpleHtmlAttribute* attr) { m_next = attr; }
// Comparisons use CmpNoCase, so name and value matching ignores case.
94 bool HasName(const wxString& name) const { return (0 == m_name.CmpNoCase(name)); }
95 bool HasValue(const wxString& val) const { return (0 == m_value.CmpNoCase(val)); }
// Next attribute in the chain (NULL when there is none).
100 wxSimpleHtmlAttribute* m_next;
106 * Representation of a tag or chunk of text
// Kinds of node. The comment on wxSimpleHtmlTag::GetText refers to
// wxSimpleHtmlTag_Text, so these are presumably the values taken by the
// int 'tagType' constructor argument and returned by
// wxSimpleHtmlTag::GetType() -- confirm against the implementation.
//   wxSimpleHtmlTag_Text      - a chunk of text
//   wxSimpleHtmlTag_TopLevel  - presumably the root node held by the parser
//   wxSimpleHtmlTag_Open      - presumably an opening tag
//   wxSimpleHtmlTag_Close     - presumably a closing tag
//   wxSimpleHtmlTag_Directive - presumably a directive such as <!DOCTYPE ....>
109 enum { wxSimpleHtmlTag_Text, wxSimpleHtmlTag_TopLevel, wxSimpleHtmlTag_Open, wxSimpleHtmlTag_Close, wxSimpleHtmlTag_Directive };
// One node of the document: a tag or a chunk of text. A node has a name, an
// integer type, text, a chain of attributes (m_attributes), a pointer to its
// children (m_children), a next-sibling link (m_next) and a parent link
// (m_parent).
111 class wxSimpleHtmlTag
// Construct a node from a name and a type. tagType is presumably one of the
// wxSimpleHtmlTag_* enum values -- confirm against the implementation.
114 wxSimpleHtmlTag(const wxString& tagName, int tagType);
// Attribute and child maintenance. Only the declarations are visible here.
// FindAttribute returns a pointer; presumably NULL when nothing matches --
// confirm in the implementation.
118 void ClearAttributes();
119 wxSimpleHtmlAttribute* FindAttribute(const wxString& name) const ;
120 void AppendAttribute(const wxString& name, const wxString& value);
121 void ClearChildren();
122 void AppendTag(wxSimpleHtmlTag* tag);
// Write this node to the given output stream (only declared here).
124 void Write(wxOutputStream& stream);
126 // Gets the text from this tag and its descendants
127 wxString GetTagText();
// Name accessors.
130 const wxString& GetName() const { return m_name; }
131 void SetName(const wxString& name) { m_name = name; }
// Type accessors; the type is held as a plain int.
133 int GetType() const { return m_type; }
134 void SetType(int t) { m_type = t; }
136 // If type is wxSimpleHtmlTag_Text, m_text will contain some text.
137 const wxString& GetText() const { return m_text; }
138 void SetText(const wxString& text) { m_text = text; }
// Head of this node's attribute chain. SetFirstAttribute only overwrites the
// pointer; nothing is done with the previous value here.
140 wxSimpleHtmlAttribute* GetFirstAttribute() { return m_attributes; }
141 void SetFirstAttribute(wxSimpleHtmlAttribute* attr) { m_attributes = attr; }
// Counted/indexed access to the attributes. 'i' is presumably a zero-based
// position in the chain -- confirm in the implementation.
143 int GetAttributeCount() const ;
144 wxSimpleHtmlAttribute* GetAttribute(int i) const ;
// m_children accessors. Since m_next is the next-sibling link, m_children is
// presumably the first child -- confirm. SetChildren only overwrites the
// pointer.
146 wxSimpleHtmlTag* GetChildren() const { return m_children; }
147 void SetChildren(wxSimpleHtmlTag* children) { m_children = children; }
// Parent link accessors.
149 wxSimpleHtmlTag* GetParent() const { return m_parent; }
150 void SetParent(wxSimpleHtmlTag* parent) { m_parent = parent; }
// Counted/indexed access to the children; 'i' is presumably zero-based --
// confirm in the implementation.
151 int GetChildCount() const;
152 wxSimpleHtmlTag* GetChild(int i) const;
// Next sibling of this node.
153 wxSimpleHtmlTag* GetNext() const { return m_next; }
155 //// Convenience accessors & search functions
// NameIs compares with CmpNoCase, so the match ignores case.
// NOTE(review): NameIs is not const-qualified although it does not modify the
// object; the HasAttribute overloads below are const.
156 bool NameIs(const wxString& name) { return (m_name.CmpNoCase(name) == 0); }
157 bool HasAttribute(const wxString& name, const wxString& value) const;
158 bool HasAttribute(const wxString& name) const;
// 'value' is an output parameter (non-const reference). The bool result
// presumably reports whether the attribute was found -- confirm.
// NOTE(review): not const-qualified, unlike HasAttribute.
159 bool GetAttributeValue(wxString& value, const wxString& attrName);
161 // Search forward from this tag until we find a tag with this name & attribute
162 wxSimpleHtmlTag* FindTag(const wxString& tagName, const wxString& attrName);
164 // Gather the text until we hit the given close tag
165 bool FindTextUntilTagClose(wxString& text, const wxString& tagName);
// Head of the attribute chain.
171 wxSimpleHtmlAttribute* m_attributes;
// Child nodes of this node.
174 wxSimpleHtmlTag* m_children;
175 wxSimpleHtmlTag* m_next; // Next sibling
// Parent node, as returned by GetParent().
176 wxSimpleHtmlTag* m_parent;
181 * Simple HTML parser, for such tasks as scanning HTML for keywords, contents, etc.
// Simple HTML parser derived from wxObject. It keeps the text being parsed
// (m_text), that text's length (m_length), a current position within it
// (m_pos) and a pointer to the top-level tag (m_topLevel).
184 class wxSimpleHtmlParser : public wxObject
188 wxSimpleHtmlParser();
189 ~wxSimpleHtmlParser();
// Entry points: parse the named file or an in-memory string. The bool result
// presumably reports success -- confirm in the implementation.
192 bool ParseFile(const wxString& filename);
193 bool ParseString(const wxString& str);
// Output: write to a stream, or to a file given by name.
// NOTE(review): WriteFile takes a non-const wxString&, whereas ParseFile
// takes const wxString&; callers cannot pass a temporary or a literal.
196 void Write(wxOutputStream& stream);
197 bool WriteFile(wxString& filename);
201 // Main recursive parsing function
202 bool ParseHtml(wxSimpleHtmlTag* parent);
// Parsers for the individual constructs. Those returning wxSimpleHtmlTag*
// presumably return NULL on failure -- confirm in the implementation.
204 wxSimpleHtmlTag* ParseTagHeader();
205 wxSimpleHtmlTag* ParseTagClose();
206 bool ParseAttributes(wxSimpleHtmlTag* tag);
207 wxSimpleHtmlTag* ParseDirective(); // e.g. <!DOCTYPE ....>
208 bool ParseComment(); // Throw away comments
209 // Plain text, up until an angled bracket
210 bool ParseText(wxString& text);
// Low-level token readers. 'str' receives the token that was read. 'eatIt'
// defaults to FALSE and presumably controls whether the current position is
// advanced past the token -- confirm in the implementation.
212 bool EatWhitespace(); // Throw away whitespace
213 bool EatWhitespace(int& pos); // Throw away whitespace: using 'pos'
214 bool ReadString(wxString& str, bool eatIt = FALSE);
215 bool ReadWord(wxString& str, bool eatIt = FALSE);
216 bool ReadNumber(wxString& str, bool eatIt = FALSE);
217 // Could be number, string, whatever, but read up until whitespace.
218 bool ReadLiteral(wxString& str, bool eatIt = FALSE);
// Character classification predicates; each takes the character as an int.
225 bool IsTagStartBracket(int ch);
226 bool IsTagEndBracket(int ch);
227 bool IsWhitespace(int ch);
228 bool IsAlpha(int ch);
229 bool IsWordChar(int ch);
230 bool IsNumeric(int ch);
232 // Matches this string (case insensitive)
233 bool Matches(const wxString& tok, bool eatIt = FALSE) ;
// End-of-input tests: true once the current position (or the supplied
// position) is at or beyond m_length.
234 bool Eof() const { return (m_pos >= m_length); }
235 bool Eof(int pos) const { return (pos >= m_length); }
// Sets the current position directly; no range check is made here.
237 void SetPosition(int pos) { m_pos = pos; }
// Returns the top-level tag pointer held by the parser.
241 wxSimpleHtmlTag* GetTopLevelTag() const { return m_topLevel; }
243 // Safe way of getting a character
// NOTE(review): the index is size_t while m_pos and m_length are int, so
// callers passing m_pos rely on an implicit signed-to-unsigned conversion.
244 int GetChar(size_t i) const;
// Top-level tag, as returned by GetTopLevelTag().
248 wxSimpleHtmlTag* m_topLevel;
249 int m_pos; // Position in string
250 int m_length; // Length of string
251 wxString m_text; // The actual text
256 * wxSimpleHtmlTagSpec
257 * Describes a tag, and what type it is.
258 * wxSimpleHtmlModule will initialise/cleanup a list of these, one per tag type
// Describes a tag by its name and an integer type. Derives from wxObject.
262 class wxSimpleHtmlTagSpec : public wxObject
// Construct a spec from a tag name and a type value.
266 wxSimpleHtmlTagSpec(const wxString& name, int type);
// Static registration function; presumably adds 'spec' to sm_tagSpecs --
// confirm in the implementation.
269 static void AddTagSpec(wxSimpleHtmlTagSpec* spec);
// Read-only access to the name and type.
273 const wxString& GetName() const { return m_name; }
274 int GetType() const { return m_type; }
// Static list pointer shared by all instances; presumably holds the
// registered specs -- confirm in the implementation.
281 static wxList* sm_tagSpecs;
286 * Responsible for init/cleanup of appropriate data structures
289 class wxSimpleHtmlModule : public wxModule
291 DECLARE_DYNAMIC_CLASS(wxSimpleHtmlModule)
294 wxSimpleHtmlModule() {};