From be5a70d3aa1c30d7c86d77649b747de2838566ce Mon Sep 17 00:00:00 2001 From: sienkiew Date: Thu, 21 Jul 2011 15:17:58 +0000 Subject: initial import of yapps from debian sources git-svn-id: http://svn.stsci.edu/svn/ssb/etal/exyapps/trunk@356 d34015c8-bcbb-4646-8ac8-8ba5febf221d --- LICENSE | 20 + changelog | 983 +++++++++++++++++++++++++++++++++++++++ doc/yapps2.haux | 31 ++ doc/yapps2.html | 1206 ++++++++++++++++++++++++++++++++++++++++++++++++ doc/yapps2.htoc | 36 ++ doc/yapps2.tex | 1246 ++++++++++++++++++++++++++++++++++++++++++++++++++ examples/calc.g | 64 +++ examples/expr.g | 21 + examples/lisp.g | 13 + examples/notes | 44 ++ examples/xml.g | 66 +++ setup.py | 42 ++ test/empty_clauses.g | 10 + test/line_numbers.g | 10 + test/option.g | 17 + yapps/__init__.py | 1 + yapps/grammar.py | 211 +++++++++ yapps/parsetree.py | 673 +++++++++++++++++++++++++++ yapps/runtime.py | 442 ++++++++++++++++++ yapps2.py | 113 +++++ yapps_grammar.g | 121 +++++ 21 files changed, 5370 insertions(+) create mode 100644 LICENSE create mode 100644 changelog create mode 100644 doc/yapps2.haux create mode 100644 doc/yapps2.html create mode 100644 doc/yapps2.htoc create mode 100644 doc/yapps2.tex create mode 100644 examples/calc.g create mode 100644 examples/expr.g create mode 100644 examples/lisp.g create mode 100644 examples/notes create mode 100644 examples/xml.g create mode 100644 setup.py create mode 100644 test/empty_clauses.g create mode 100644 test/line_numbers.g create mode 100644 test/option.g create mode 100644 yapps/__init__.py create mode 100644 yapps/grammar.py create mode 100644 yapps/parsetree.py create mode 100644 yapps/runtime.py create mode 100755 yapps2.py create mode 100644 yapps_grammar.g diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..87fd179 --- /dev/null +++ b/LICENSE @@ -0,0 +1,20 @@ + + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in 
the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/changelog b/changelog new file mode 100644 index 0000000..4fdf12e --- /dev/null +++ b/changelog @@ -0,0 +1,983 @@ +ChangeSet + 1.38 05/01/22 19:36:32 smurf@smurf.noris.de +2 -0 + Add option to limit backtrace depth on syntax errors. + + yapps/runtime.py + 1.15 05/01/22 19:36:31 smurf@smurf.noris.de +5 -1 + Add option to limit backtrace depth on syntax errors. + + debian/changelog + 1.20 05/01/22 19:36:31 smurf@smurf.noris.de +2 -1 + Add option to limit backtrace depth on syntax errors. + +ChangeSet + 1.37 05/01/22 03:39:56 smurf@smurf.noris.de +2 -0 + Fix recursive includes. + + yapps/runtime.py + 1.14 05/01/22 03:39:54 smurf@smurf.noris.de +395 -381 + Fix recursive includes. + + debian/changelog + 1.19 05/01/22 03:39:54 smurf@smurf.noris.de +6 -0 + Fix recursive includes. + +ChangeSet + 1.36 04/12/23 23:49:52 smurf@smurf.noris.de +1 -0 + Brown paper bag -- fix Python 2.4 stuff. + + debian/changelog + 1.18 04/12/23 23:49:52 smurf@smurf.noris.de +6 -0 + Brown paper bag -- fix Python 2.4 stuff. 
+ +ChangeSet + 1.35 04/12/23 21:00:34 smurf@smurf.noris.de +1 -0 + typo + + debian/control + 1.10 04/12/23 21:00:33 smurf@smurf.noris.de +1 -1 + typo + +ChangeSet + 1.34 04/12/12 20:22:54 smurf@smurf.noris.de +2 -0 + Add support for Python 2.4 + + debian/control + 1.9 04/12/12 20:22:52 smurf@smurf.noris.de +1 -1 + Add support for Python 2.4 + + debian/changelog + 1.17 04/12/12 20:22:52 smurf@smurf.noris.de +6 -0 + Add support for Python 2.4 + +ChangeSet + 1.33 04/09/23 11:24:16 smurf@smurf.noris.de +3 -0 + update documentation: + - toss hyphens + - document extensions + + doc/yapps2.tex + 1.3 04/09/23 11:24:16 smurf@smurf.noris.de +21 -0 + add a Debian Extensions section + + debian/yapps.1 + 1.2 04/09/23 11:24:16 smurf@smurf.noris.de +14 -9 + escape more hyphens (i.e., all the rest) + + debian/changelog + 1.16 04/09/23 11:24:16 smurf@smurf.noris.de +2 -0 + update documentation: + - toss hyphens + - document extensions + +ChangeSet + 1.32 04/09/23 11:23:24 smurf@smurf.noris.de +2 -0 + turn off triggers + + BitKeeper/triggers/pre-commit.upversion + 1.2 04/09/23 11:23:24 smurf@smurf.noris.de +2 -0 + off + + BitKeeper/triggers/post-commit.changelog + 1.2 04/09/23 11:23:24 smurf@smurf.noris.de +2 -0 + off + +ChangeSet + 1.31 04/09/23 10:55:24 smurf@smurf.noris.de +1 -0 + ignore new package's files + + BitKeeper/etc/ignore + 1.17 04/09/23 10:55:23 smurf@smurf.noris.de +1 -0 + added debian/yapps2-runtime/* + + debian/yapps2-runtime.README + 1.1 04/09/23 10:50:33 smurf@smurf.noris.de +11 -0 + +ChangeSet + 1.30 04/09/23 10:50:33 smurf@smurf.noris.de +8 -0 + split off runtime to its own package + document the fact that I can't use the original runtime + + debian/yapps2-runtime.dirs + 1.6 04/09/23 10:50:33 smurf@smurf.noris.de +2 -4 + split off runtime + + debian/yapps2-runtime.README + 1.0 04/09/23 10:50:33 smurf@smurf.noris.de +0 -0 + BitKeeper file /daten/src/debian/python_yapps/debian/yapps2-runtime.README + + debian/rules + 1.5 04/09/23 10:50:33 smurf@smurf.noris.de +4 
-1 + move runtime files to their own package + + debian/control + 1.8 04/09/23 10:50:33 smurf@smurf.noris.de +14 -1 + split off runtime to its own package + + debian/changelog + 1.15 04/09/23 10:50:33 smurf@smurf.noris.de +9 -0 + document package split + + debian/README + 1.2 04/09/23 10:50:33 smurf@smurf.noris.de +21 -4 + Updated for package split + + debian/yapps2.docs + 1.3 04/09/23 10:31:15 smurf@smurf.noris.de +0 -0 + Rename: debian/docs -> debian/yapps2.docs + + debian/yapps2-runtime.dirs + 1.5 04/09/23 10:30:48 smurf@smurf.noris.de +0 -0 + bk cp yapps2.dirs yapps2-runtime.dirs + + debian/yapps2.dirs + 1.4 04/09/23 10:30:42 smurf@smurf.noris.de +0 -0 + Rename: debian/dirs -> debian/yapps2.dirs + + debian/yapps2.dirs + 1.4 04/09/23 10:30:42 smurf@smurf.noris.de +0 -0 + Rename: debian/dirs -> debian/yapps2.dirs + +ChangeSet + 1.29 04/07/19 09:30:22 smurf@smurf.noris.de +5 -0 + latex2html => hevea + + debian/yapps2.doc-base + 1.2 04/07/19 09:30:21 smurf@smurf.noris.de +2 -2 + latex2html => hevea + + debian/rules + 1.4 04/07/19 09:30:21 smurf@smurf.noris.de +4 -2 + latex2html => hevea + + debian/control + 1.7 04/07/19 09:30:21 smurf@smurf.noris.de +1 -1 + latex2html => hevea + + debian/changelog + 1.14 04/07/19 09:30:21 smurf@smurf.noris.de +6 -0 + latex2html => hevea + + BitKeeper/etc/ignore + 1.16 04/07/19 09:30:06 smurf@smurf.noris.de +1 -0 + added doc/yapps2.haux + + BitKeeper/etc/ignore + 1.15 04/07/19 09:29:55 smurf@smurf.noris.de +1 -0 + added doc/yapps2.ht* + +ChangeSet + 1.28 04/07/12 09:35:59 smurf@smurf.noris.de +2 -0 + Build-Depend on python. + + debian/control + 1.6 04/07/12 09:35:58 smurf@smurf.noris.de +1 -1 + Build-Depend on python. 
+ + debian/changelog + 1.13 04/07/12 09:35:58 smurf@smurf.noris.de +6 -0 + doc + +ChangeSet + 1.27 04/05/16 22:02:40 smurf@smurf.noris.de +2 -0 + ship "empty" file + + yapps/__init__.py + 1.2 04/05/16 22:02:39 smurf@smurf.noris.de +1 -0 + ship "empty" file + + debian/changelog + 1.12 04/05/16 22:02:39 smurf@smurf.noris.de +2 -1 + doc + +ChangeSet + 1.26 04/05/16 22:01:42 smurf@smurf.noris.de +2 -0 + Typo (made large file handling slow) + + yapps/runtime.py + 1.13 04/05/16 22:01:42 smurf@smurf.noris.de +1 -1 + Typo + + debian/changelog + 1.11 04/05/16 22:01:42 smurf@smurf.noris.de +6 -0 + Version 2.1.1-11 + +ChangeSet + 1.25 04/05/14 12:25:51 smurf@smurf.noris.de +1 -0 + exporter: test was in wrong dir + + debian/exporter + 1.3 04/05/14 12:25:51 smurf@smurf.noris.de +1 -0 + wrong dir + +ChangeSet + 1.24 04/05/14 12:20:04 smurf@smurf.noris.de +1 -0 + Clean up external source before generating a diff + + debian/exporter + 1.2 04/05/14 12:20:04 smurf@smurf.noris.de +7 -0 + Clean up external source before generating a diff + +ChangeSet + 1.23 04/05/14 12:14:34 smurf@linux.smurf.noris.de +13 -0 + Documentation update: + build and install HTML documentation from LaTex source + + debian/changelog + 1.10 04/05/14 12:14:33 smurf@linux.smurf.noris.de +7 -0 + Version 2.1.1-10 + + debian/yapps2.doc-base + 1.1 04/05/14 12:14:32 smurf@smurf.noris.de +13 -0 + + yapps_grammar.g + 1.5 04/05/14 12:14:31 smurf@smurf.noris.de +1 -0 + add my copyright notice + + yapps/runtime.py + 1.12 04/05/14 12:14:31 smurf@smurf.noris.de +1 -0 + add my copyright notice + + debian/yapps2.doc-base + 1.0 04/05/14 12:14:31 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/yapps2.doc-base + + debian/rules + 1.3 04/05/14 12:14:31 smurf@smurf.noris.de +5 -5 + gernerate and install html documentation + don't install LICENSE file + + debian/yapps2-runtime.dirs + 1.3 04/05/14 12:14:30 smurf@smurf.noris.de +1 -2 + drop overrides + add doc dir +html + + debian/docs + 1.2 04/05/14 
12:14:30 smurf@smurf.noris.de +1 -1 + install latex documentation + + debian/dirs + 1.3 04/05/14 12:14:30 smurf@smurf.noris.de +1 -2 + drop overrides + add doc dir +html + + debian/copyright + 1.3 04/05/14 12:14:30 smurf@smurf.noris.de +21 -3 + include license here instead of installing a LICENSE file + + debian/control + 1.5 04/05/14 12:14:30 smurf@smurf.noris.de +6 -6 + Dep on latex2html + indent list + + BitKeeper/etc/ignore + 1.14 04/05/14 12:06:12 smurf@smurf.noris.de +1 -0 + added doc/yapps2/* + + BitKeeper/etc/ignore + 1.13 04/05/14 12:06:07 smurf@smurf.noris.de +3 -0 + added debian/yapps2/* debian/*.substvars debian/*.debhelper + + BitKeeper/deleted/.del-overrides.lintian~19711613dc4ce90f + 1.3 04/05/14 11:51:33 smurf@smurf.noris.de +0 -0 + Delete: debian/overrides.lintian + + BitKeeper/deleted/.del-overrides.linda~b0c6fa08da170a16 + 1.2 04/05/14 11:51:33 smurf@smurf.noris.de +0 -0 + Delete: debian/overrides.linda + + doc/yapps2.tex + 1.2 04/05/14 11:34:34 smurf@smurf.noris.de +0 -0 + Rename: yapps2.tex -> doc/yapps2.tex + +ChangeSet + 1.22 04/05/14 11:33:27 smurf@smurf.noris.de +1 -0 + Merge bk://server/public/python_yapps + into smurf.noris.de:/usr/local/src/misc/yapps + + BitKeeper/deleted/.del-logging_ok~530b65bc14e5cc7c + 1.2 04/05/14 11:33:26 smurf@smurf.noris.de +0 -0 + 'Auto converge rename' + + BitKeeper/etc/logging_ok + 1.1 04/05/14 11:33:13 smurf@smurf.noris.de +1 -0 + +ChangeSet + 1.4.1.1 04/05/14 11:33:13 smurf@linux.smurf.noris.de +2 -0 + Added tex documentation from yapps-2.0.4. + + BitKeeper/etc/logging_ok + 1.0 04/05/14 11:33:13 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/BitKeeper/etc/logging_ok + + yapps2.tex + 1.1 04/05/14 11:33:10 smurf@smurf.noris.de +1225 -0 + + yapps2.tex + 1.0 04/05/14 11:33:10 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/yapps2.tex + +ChangeSet + 1.21 04/05/14 11:31:18 smurf@linux.smurf.noris.de +7 -0 + Renamed the package to "yapps2". 
+ + setup.py + 1.2 04/05/14 11:31:17 smurf@smurf.noris.de +17 -2 + Fixed name + Updated long description + + debian/yapps2-runtime.dirs + 1.2 04/05/14 11:31:17 smurf@smurf.noris.de +1 -1 + rename: python-yapps => yapps2 + + debian/rules + 1.2 04/05/14 11:31:17 smurf@smurf.noris.de +8 -8 + rename: python-yapps => yapps2 + + debian/overrides.lintian + 1.2 04/05/14 11:31:17 smurf@smurf.noris.de +1 -1 + rename: python-yapps => yapps2 + + debian/dirs + 1.2 04/05/14 11:31:17 smurf@smurf.noris.de +1 -1 + rename: python-yapps => yapps2 + + debian/copyright + 1.2 04/05/14 11:31:16 smurf@smurf.noris.de +11 -3 + Added pointer to original source + + debian/control + 1.4 04/05/14 11:31:16 smurf@smurf.noris.de +18 -10 + rename: python-yapps => yapps2 + + debian/changelog + 1.9 04/05/14 11:31:16 smurf@smurf.noris.de +13 -13 + Cleanup + +ChangeSet + 1.20 03/12/31 14:00:42 smurf@linux.smurf.noris.de +2 -0 + require python-dev because of distutils + + debian/changelog + 1.8 03/12/31 14:00:42 smurf@linux.smurf.noris.de +6 -0 + Version 2.1.1-8 + + debian/control + 1.3 03/12/31 14:00:40 smurf@smurf.noris.de +1 -1 + require python-dev because of distutils + +ChangeSet + 1.19 03/12/31 13:57:38 smurf@linux.smurf.noris.de +2 -0 + Change yapps.py t exit 1 on failure to parse + + debian/changelog + 1.7 03/12/31 13:57:38 smurf@linux.smurf.noris.de +6 -0 + Version 2.1.1-7 + + yapps2.py + 1.6 03/12/31 13:57:37 smurf@smurf.noris.de +3 -2 + exit 1 on error + +ChangeSet + 1.18 03/12/30 15:36:56 smurf@linux.smurf.noris.de +2 -0 + Update to 3.6.1, use build-depends-indep. + + debian/changelog + 1.6 03/12/30 15:36:56 smurf@linux.smurf.noris.de +6 -0 + Version 2.1.1-6 + + debian/control + 1.2 03/12/30 15:36:55 smurf@smurf.noris.de +2 -2 + Update to 3.6.1, use build-depends-indep. + +ChangeSet + 1.17 03/12/30 15:33:19 smurf@linux.smurf.noris.de +2 -0 + Add some notes. 
+ + debian/changelog + 1.5 03/12/30 15:33:19 smurf@linux.smurf.noris.de +6 -0 + Version 2.1.1-5 + + examples/notes + 1.1 03/12/30 15:33:18 smurf@smurf.noris.de +44 -0 + + examples/notes + 1.0 03/12/30 15:33:17 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/examples/notes + +ChangeSet + 1.16 03/12/30 15:30:05 smurf@linux.smurf.noris.de +2 -0 + Correctly report syntax errors without line number + + debian/changelog + 1.4 03/12/30 15:30:05 smurf@linux.smurf.noris.de +6 -0 + Version 2.1.1-4 + + yapps/runtime.py + 1.11 03/12/30 15:30:04 smurf@smurf.noris.de +6 -2 + Report syntax errors with no line number + +ChangeSet + 1.15 03/12/30 14:02:37 smurf@linux.smurf.noris.de +2 -0 + Repair ignored-pattern upcall. + + debian/changelog + 1.3 03/12/30 14:02:37 smurf@linux.smurf.noris.de +6 -0 + Version 2.1.1-3 + + yapps/runtime.py + 1.10 03/12/30 14:02:36 smurf@smurf.noris.de +4 -2 + Repair ignore upcall. + +ChangeSet + 1.14 03/12/30 13:30:14 smurf@linux.smurf.noris.de +2 -0 + runtime: fix error reporting + + debian/changelog + 1.2 03/12/30 13:30:14 smurf@linux.smurf.noris.de +6 -0 + Version 2.1.1-2 + + yapps/runtime.py + 1.9 03/12/30 13:30:12 smurf@smurf.noris.de +9 -9 + Fix error reporting + +ChangeSet + 1.13 03/12/30 12:25:29 smurf@linux.smurf.noris.de +2 -0 + replace runtime grammar + yapps_grammar.g: delete shebang line, fix imports + + yapps_grammar.g + 1.4 03/12/30 12:25:28 smurf@smurf.noris.de +1 -3 + fix import + delete shebang line + + yapps/grammar.py + 1.11 03/12/30 12:25:28 smurf@smurf.noris.de +49 -66 + replace runtime grammar + +ChangeSet + 1.12 03/12/30 11:51:26 smurf@linux.smurf.noris.de +19 -0 + D + + setup.py + 1.1 03/12/30 11:51:25 smurf@smurf.noris.de +27 -0 + + debian/yapps.1 + 1.1 03/12/30 11:51:25 smurf@smurf.noris.de +58 -0 + + debian/rules + 1.1 03/12/30 11:51:25 smurf@smurf.noris.de +91 -0 + + debian/overrides.lintian + 1.1 03/12/30 11:51:25 smurf@smurf.noris.de +1 -0 + + setup.py + 1.0 03/12/30 11:51:25 smurf@smurf.noris.de +0 
-0 + BitKeeper file /usr/local/src/misc/yapps/setup.py + + debian/yapps2-runtime.dirs + 1.1 03/12/30 11:51:24 smurf@smurf.noris.de +5 -0 + + debian/yapps.1 + 1.0 03/12/30 11:51:25 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/yapps.1 + + debian/rules + 1.0 03/12/30 11:51:25 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/rules + + debian/overrides.lintian + 1.0 03/12/30 11:51:25 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/overrides.lintian + + debian/overrides.linda + 1.1 03/12/30 11:51:24 smurf@smurf.noris.de +3 -0 + + debian/exporter + 1.1 03/12/30 11:51:24 smurf@smurf.noris.de +10 -0 + + debian/docs + 1.1 03/12/30 11:51:24 smurf@smurf.noris.de +3 -0 + + debian/dirs + 1.1 03/12/30 11:51:24 smurf@smurf.noris.de +5 -0 + + debian/copyright + 1.1 03/12/30 11:51:24 smurf@smurf.noris.de +15 -0 + + debian/control + 1.1 03/12/30 11:51:24 smurf@smurf.noris.de +19 -0 + + debian/compat + 1.1 03/12/30 11:51:24 smurf@smurf.noris.de +1 -0 + + debian/README + 1.1 03/12/30 11:51:24 smurf@smurf.noris.de +6 -0 + + yapps_grammar.g + 1.3 03/12/30 11:51:24 smurf@smurf.noris.de +0 -1 + Make the scanner context-sensitive. Works better. + + yapps2.py + 1.5 03/12/30 11:51:24 smurf@smurf.noris.de +1 -1 + Fix path + + yapps/runtime.py + 1.8 03/12/30 11:51:24 smurf@smurf.noris.de +0 -1 + Regularize header + + yapps/parsetree.py + 1.7 03/12/30 11:51:24 smurf@smurf.noris.de +0 -2 + Drop shebang line, this is not a program. + + yapps/grammar.py + 1.10 03/12/30 11:51:24 smurf@smurf.noris.de +0 -2 + Drop shebang line, this is not a program. 
+ + debian/yapps2-runtime.dirs + 1.0 03/12/30 11:51:24 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/dirs + + debian/overrides.linda + 1.0 03/12/30 11:51:24 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/overrides.linda + + debian/exporter + 1.0 03/12/30 11:51:24 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/exporter + + debian/docs + 1.0 03/12/30 11:51:24 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/docs + + debian/dirs + 1.0 03/12/30 11:51:24 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/dirs + + debian/copyright + 1.0 03/12/30 11:51:24 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/copyright + + debian/control + 1.0 03/12/30 11:51:24 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/control + + debian/compat + 1.0 03/12/30 11:51:24 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/compat + + debian/README + 1.0 03/12/30 11:51:24 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/README + + debian/changelog + 1.1 03/12/30 11:41:14 smurf@smurf.noris.de +15 -0 + + debian/changelog + 1.0 03/12/30 11:41:14 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/changelog + + BitKeeper/etc/ignore + 1.12 03/12/30 11:40:56 smurf@smurf.noris.de +1 -0 + added changelog + +ChangeSet + 1.11 03/12/30 11:23:09 smurf@linux.smurf.noris.de +5 -0 + Rewrote imports et al. 
to create+use a real "yapps" module + + yapps/__init__.py + 1.1 03/12/30 11:23:08 smurf@smurf.noris.de +0 -0 + + yapps2.py + 1.4 03/12/30 11:23:08 smurf@smurf.noris.de +3 -3 + Refactor to use reasonable "yapps" module + + yapps/runtime.py + 1.7 03/12/30 11:23:08 smurf@smurf.noris.de +0 -0 + Rename: yappsrt.py -> yapps/runtime.py + + yapps/parsetree.py + 1.6 03/12/30 11:23:08 smurf@smurf.noris.de +10 -10 + Refactor to use reasonable "yapps" module + Rename: parsetree.py -> yapps/parsetree.py + + yapps/__init__.py + 1.0 03/12/30 11:23:08 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/yapps/__init__.py + + yapps/grammar.py + 1.9 03/12/30 11:23:07 smurf@smurf.noris.de +16 -16 + Refactor to use reasonable "yapps" module + Rename: grammar.py -> yapps/grammar.py + + BitKeeper/etc/ignore + 1.11 03/12/30 11:22:15 smurf@smurf.noris.de +4 -0 + added build/* debian/python-yapps/* debian/*.debhelper debian/*.substvars + +ChangeSet + 1.10 03/12/29 22:10:59 smurf@linux.smurf.noris.de +1 -0 + Added context-insensitive-scanner end test + to a couple of composite grammar statements. + + It's probably overkill to do that with all statements..? + + parsetree.py + 1.5 03/12/29 22:10:58 smurf@smurf.noris.de +19 -5 + Added context-insensitive-scanner end test + to a couple of composite statements. + + It's probably overkill to do that with all statements..? 
+ +ChangeSet + 1.9 03/12/29 22:05:00 smurf@linux.smurf.noris.de +1 -0 + yappsrt.py: Bugfix: stored token type + + yappsrt.py + 1.6 03/12/29 22:04:59 smurf@smurf.noris.de +1 -1 + Bugfix: stored token type + +ChangeSet + 1.8 03/12/29 21:42:41 smurf@linux.smurf.noris.de +4 -0 + Pass init arguments to the scanner + Simplify stuff a bit + + yappsrt.py + 1.5 03/12/29 21:42:40 smurf@smurf.noris.de +16 -8 + Fix line counting + simplify pattern length determination + + yapps2.py + 1.3 03/12/29 21:42:40 smurf@smurf.noris.de +3 -2 + Pass filename to scanner + + parsetree.py + 1.4 03/12/29 21:42:40 smurf@smurf.noris.de +2 -2 + Pass init arguments to the scanner + + grammar.py + 1.8 03/12/29 21:42:40 smurf@smurf.noris.de +3 -2 + Update for changed yapps2.py + +ChangeSet + 1.7 03/12/29 20:37:55 smurf@linux.smurf.noris.de +6 -0 + Cleanup ignored-symbol commands + Fix including and error reporting + + yappsrt.py + 1.4 03/12/29 20:37:54 smurf@smurf.noris.de +88 -52 + cleanup ignored symbol handling + refactor _scan and _peek: move to Scanner + generate pseudo filenames for inline documents + accept context for error handling + + yapps_grammar.g + 1.2 03/12/29 20:37:54 smurf@smurf.noris.de +4 -1 + Cleanup statements for ignored symbols + + yapps2.py + 1.2 03/12/29 20:37:54 smurf@smurf.noris.de +1 -1 + Setup line numbers correctly + + parsetree.py + 1.3 03/12/29 20:37:54 smurf@smurf.noris.de +22 -6 + Ignored-symbol handling extended + Pass context to scanners + + grammar.py + 1.7 03/12/29 20:37:54 smurf@smurf.noris.de +1 -1 + Use a hash for ignored stuff + + examples/calc.g + 1.3 03/12/29 20:37:54 smurf@smurf.noris.de +7 -4 + Cleanup include handling: use an ignored token + +ChangeSet + 1.6 03/12/29 18:16:19 smurf@linux.smurf.noris.de +6 -0 + Reproduce current grammar file + + One problem with attribute remains. 
+ + yapps_grammar.g + 1.1 03/12/29 18:16:18 smurf@smurf.noris.de +120 -0 + + yappsrt.py + 1.3 03/12/29 18:16:18 smurf@smurf.noris.de +5 -5 + charpos => pos + + yapps_grammar.g + 1.0 03/12/29 18:16:18 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/yapps_grammar.g + + BitKeeper/etc/ignore + 1.10 03/12/29 18:15:23 smurf@smurf.noris.de +1 -0 + added *.swp + + BitKeeper/etc/ignore + 1.9 03/12/29 18:15:02 smurf@smurf.noris.de +1 -0 + added yapps_grammar.py + + BitKeeper/deleted/.del-yapps_grammar.py~276aa227aa238250 + 1.4 03/12/29 18:14:35 smurf@smurf.noris.de +0 -0 + Delete: yapps_grammar.py + + grammar.py + 1.6 03/12/29 18:14:17 smurf@smurf.noris.de +0 -0 + Rename: BitKeeper/deleted/.del-grammar.py~46b22024b3b85127 -> grammar.py + + BitKeeper/deleted/.del-grammar.py~46b22024b3b85127 + 1.5 03/12/29 18:13:11 smurf@smurf.noris.de +0 -0 + Delete: grammar.py + + yapps_grammar.py + 1.3 03/12/29 18:13:04 smurf@smurf.noris.de +0 -0 + Rename: BitKeeper/deleted/.del-yapps_grammar.py~276aa227aa238250 -> yapps_grammar.py + + grammar.py + 1.4 03/12/29 18:12:51 smurf@smurf.noris.de +0 -0 + Rename: BitKeeper/deleted/.del-grammar.py~46b22024b3b85127 -> grammar.py + + BitKeeper/deleted/.del-grammar.py~46b22024b3b85127 + 1.3 03/12/29 18:12:30 smurf@smurf.noris.de +19 -20 + Delete: grammar.py + + BitKeeper/etc/ignore + 1.8 03/12/29 17:15:10 smurf@smurf.noris.de +3 -0 + added *-stamp debian/files debian/tmp/* + + BitKeeper/etc/ignore + 1.7 03/12/29 17:15:08 smurf@smurf.noris.de +3 -0 + added *-stamp debian/files debian/tmp/* + + BitKeeper/etc/ignore + 1.6 03/12/29 17:14:00 smurf@smurf.noris.de +3 -0 + added *-stamp debian/files debian/tmp/* + + BitKeeper/triggers/post-commit.changelog + 1.1 03/12/29 17:13:58 smurf@smurf.noris.de +3 -0 + + BitKeeper/triggers/post-commit.changelog + 1.0 03/12/29 17:13:58 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/BitKeeper/triggers/post-commit.changelog + + BitKeeper/etc/logging_ok + 1.1 03/12/29 17:13:41 
smurf@smurf.noris.de +1 -0 + +ChangeSet + 1.5 03/12/29 17:13:41 smurf@linux.smurf.noris.de +7 -0 + Major enhancements: + - Use a Token object + - Allow incremental reading from a file + - Allow stacking of inputs (#include, whatever) + - Remember line numbers + - Refactor print_line_with_error into the Scanner object + + BitKeeper/etc/logging_ok + 1.0 03/12/29 17:13:41 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/BitKeeper/etc/logging_ok + + yappsrt.py + 1.2 03/12/29 17:13:38 smurf@smurf.noris.de +219 -141 + Major enhancements: + - Use a Token object + - Allow incremental reading from a file + - Allow stacking of inputs (#include, whatever) + - Remember line numbers + - Refactor print_line_with_error into the Scanner object + + parsetree.py + 1.2 03/12/29 17:13:38 smurf@smurf.noris.de +2 -2 + don't pass pos explicitly + + grammar.py + 1.2 03/12/29 17:13:38 smurf@smurf.noris.de +15 -19 + Cleanup + + generated from non-available file! + + examples/calc.g + 1.2 03/12/29 17:13:38 smurf@smurf.noris.de +5 -2 + cleanup (strip, atoi) + + allow reading in of expressions via stacking (TEST) + + BitKeeper/etc/ignore + 1.5 03/12/29 17:10:56 smurf@smurf.noris.de +1 -0 + added *.pyc + + BitKeeper/etc/ignore + 1.4 03/12/29 17:10:51 smurf@smurf.noris.de +1 -0 + added test/*.py + + BitKeeper/etc/ignore + 1.3 03/12/29 17:10:46 smurf@smurf.noris.de +1 -0 + added examples/*.py + + BitKeeper/deleted/.del-yapps_grammar.py~276aa227aa238250 + 1.2 03/12/29 13:58:49 smurf@smurf.noris.de +0 -0 + Delete: yapps_grammar.py + +ChangeSet + 1.4 03/08/28 00:22:57 ?@smurf.noris.de +15 -0 + Version 2.1.1 + +ChangeSet + 1.3 03/08/28 00:22:57 ?@smurf.noris.de +1 -0 + CVS-Ignore + +ChangeSet + 1.2 03/12/29 12:56:52 smurf@smurf.noris.de +1 -0 + Versionsnummern-Updateskript + + BitKeeper/etc/ignore + 1.2 03/08/28 00:22:57 ?@smurf.noris.de +3 -0 + added CVS .cvsignore CVSROOT + +ChangeSet + 1.1 03/12/29 12:56:52 smurf@smurf.noris.de +2 -0 + Initial repository create + + 
BitKeeper/triggers/pre-commit.upversion + 1.1 03/12/29 12:56:52 smurf@smurf.noris.de +3 -0 + + BitKeeper/etc/ignore + 1.1 03/12/29 12:56:52 smurf@smurf.noris.de +2 -0 + + BitKeeper/etc/config + 1.1 03/12/29 12:56:52 smurf@smurf.noris.de +13 -0 + +ChangeSet + 1.0 03/12/29 12:56:52 smurf@smurf.noris.de +0 -0 + BitKeeper file /tmp/b.s.20059/ChangeSet + + BitKeeper/triggers/pre-commit.upversion + 1.0 03/12/29 12:56:52 smurf@smurf.noris.de +0 -0 + BitKeeper file /tmp/b.s.20059/BitKeeper/triggers/pre-commit.upversion + + BitKeeper/etc/ignore + 1.0 03/12/29 12:56:52 smurf@smurf.noris.de +0 -0 + BitKeeper file /tmp/b.s.20059/BitKeeper/etc/ignore + + BitKeeper/etc/config + 1.0 03/12/29 12:56:52 smurf@smurf.noris.de +0 -0 + BitKeeper file /tmp/b.s.20059/BitKeeper/etc/config + + yappsrt.py + 1.1 03/08/27 23:12:19 ?@smurf.noris.de +296 -0 + New:Version 2.1.1 + + yapps_grammar.py + 1.1 03/08/28 00:22:32 ?@smurf.noris.de +234 -0 + New:Version 2.1.1 + + yapps2.py + 1.1 03/08/12 20:25:55 ?@smurf.noris.de +111 -0 + New:Version 2.1.1 + + test/option.g + 1.1 03/08/11 20:43:22 ?@smurf.noris.de +17 -0 + New:Version 2.1.1 + + test/line_numbers.g + 1.1 03/08/28 00:22:56 ?@smurf.noris.de +10 -0 + New:Version 2.1.1 + + test/empty_clauses.g + 1.1 03/08/27 23:48:11 ?@smurf.noris.de +10 -0 + New:Version 2.1.1 + + parsetree.py + 1.1 03/08/28 00:18:14 ?@smurf.noris.de +645 -0 + New:Version 2.1.1 + + grammar.py + 1.1 03/08/28 00:16:28 ?@smurf.noris.de +234 -0 + New:Version 2.1.1 + + examples/xml.g + 1.1 03/08/27 20:53:39 ?@smurf.noris.de +66 -0 + New:Version 2.1.1 + + examples/lisp.g + 1.1 03/08/11 20:18:18 ?@smurf.noris.de +13 -0 + New:Version 2.1.1 + + examples/expr.g + 1.1 03/08/08 06:47:58 ?@smurf.noris.de +21 -0 + New:Version 2.1.1 + + examples/calc.g + 1.1 03/08/11 20:17:09 ?@smurf.noris.de +58 -0 + New:Version 2.1.1 + + NOTES + 1.1 03/08/12 18:59:41 ?@smurf.noris.de +78 -0 + New:Version 2.1.1 + + LICENSE + 1.1 03/08/11 19:41:27 ?@smurf.noris.de +20 -0 + New:Version 2.1.1 + + ChangeLog + 
1.1 03/08/28 00:22:16 ?@smurf.noris.de +108 -0 + New:Version 2.1.1 + + yappsrt.py + 1.0 03/08/27 23:12:19 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/yappsrt.py + + yapps_grammar.py + 1.0 03/08/28 00:22:32 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/yapps_grammar.py + + yapps2.py + 1.0 03/08/12 20:25:55 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/yapps2.py + + test/option.g + 1.0 03/08/11 20:43:22 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/test/option.g + + test/line_numbers.g + 1.0 03/08/28 00:22:56 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/test/line_numbers.g + + test/empty_clauses.g + 1.0 03/08/27 23:48:11 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/test/empty_clauses.g + + parsetree.py + 1.0 03/08/28 00:18:14 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/parsetree.py + + grammar.py + 1.0 03/08/28 00:16:28 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/grammar.py + + examples/xml.g + 1.0 03/08/27 20:53:39 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/examples/xml.g + + examples/lisp.g + 1.0 03/08/11 20:18:18 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/examples/lisp.g + + examples/expr.g + 1.0 03/08/08 06:47:58 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/examples/expr.g + + examples/calc.g + 1.0 03/08/11 20:17:09 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/examples/calc.g + + NOTES + 1.0 03/08/12 18:59:41 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/NOTES + + LICENSE + 1.0 03/08/11 19:41:27 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/LICENSE + + ChangeLog + 1.0 03/08/28 00:22:16 ?@smurf.noris.de +0 -0 + BitKeeper file 
/home/smurf/neu/yapps-2.1.1/yapps3/ChangeLog + diff --git a/doc/yapps2.haux b/doc/yapps2.haux new file mode 100644 index 0000000..ad51bcd --- /dev/null +++ b/doc/yapps2.haux @@ -0,0 +1,31 @@ +\@addtocsec{htoc}{1}{0}{\@print{1}\quad{}Introduction} +\@addtocsec{htoc}{2}{0}{\@print{2}\quad{}Examples} +\@addtocsec{htoc}{3}{1}{\@print{2.1}\quad{}Introduction to Grammars} +\@addtocsec{htoc}{4}{1}{\@print{2.2}\quad{}Lisp Expressions} +\@addtocsec{htoc}{5}{1}{\@print{2.3}\quad{}Calculator} +\@addtocsec{htoc}{6}{1}{\@print{2.4}\quad{}Calculator with Memory} +\@addtocsec{htoc}{7}{0}{\@print{3}\quad{}Grammars} +\@addtocsec{htoc}{8}{1}{\@print{3.1}\quad{}Left Factoring} +\newlabel{sec:Left-Factoring}{{3.1}{X}} +\@addtocsec{htoc}{9}{1}{\@print{3.2}\quad{}Left Recursion} +\@addtocsec{htoc}{10}{1}{\@print{3.3}\quad{}Ambiguous Grammars} +\newlabel{sec:Ambiguous-Grammars}{{3.3}{X}} +\@addtocsec{htoc}{11}{0}{\@print{4}\quad{}Customization} +\@addtocsec{htoc}{12}{1}{\@print{4.1}\quad{}Customizing Parsers} +\@addtocsec{htoc}{13}{1}{\@print{4.2}\quad{}Customizing Scanners} +\@addtocsec{htoc}{14}{0}{\@print{5}\quad{}Parser Mechanics} +\@addtocsec{htoc}{15}{1}{\@print{5.1}\quad{}Parser Objects} +\newlabel{sec:Parser-Objects}{{5.1}{X}} +\@addtocsec{htoc}{16}{1}{\@print{5.2}\quad{}Context Sensitive Scanner} +\@addtocsec{htoc}{17}{1}{\@print{5.3}\quad{}Internal Variables} +\@addtocsec{htoc}{18}{1}{\@print{5.4}\quad{}Pre- and Post-Parser Code} +\@addtocsec{htoc}{19}{1}{\@print{5.5}\quad{}Representation of Grammars} +\@addtocsec{htoc}{20}{0}{\@print{A}\quad{}Grammar for Parsers} +\@addtocsec{htoc}{21}{0}{\@print{B}\quad{}Upgrading} +\@addtocsec{htoc}{22}{0}{\@print{C}\quad{}Troubleshooting} +\@addtocsec{htoc}{23}{0}{\@print{D}\quad{}History} +\@addtocsec{htoc}{24}{0}{\@print{E}\quad{}Debian Extensions} +\newlabel{sec:debian}{{E}{X}} +\@addtocsec{htoc}{25}{0}{\@print{F}\quad{}Future Extensions} +\newlabel{sec:future}{{F}{X}} +\@addtocsec{htoc}{26}{0}{\@print{G}\quad{}References} diff --git 
a/doc/yapps2.html b/doc/yapps2.html new file mode 100644 index 0000000..f95be4d --- /dev/null +++ b/doc/yapps2.html @@ -0,0 +1,1206 @@ + + + + + + + + + + + + + + + + + + +
+
+
+ + + + + + + + + + + + +
The Yapps Parser Generator System
http://theory.stanford.edu/~amitp/Yapps/
Version 2
 
Amit J. Patel
http://www-cs-students.stanford.edu/ amitp/ +http://www-cs-students.stanford.edu/ amitp/

+

+
+ + +

1  Introduction

+ +Yapps (Yet Another Python +Parser System) is an easy to use parser +generator that is written in Python and generates Python code. There +are several parser generator systems already available for Python, +including PyLR, kjParsing, PyBison, and mcf.pars, +but I had different goals for my parser. Yapps is simple, is easy to +use, and produces human-readable parsers. It is not the fastest or +most powerful parser. Yapps is designed to be used when regular +expressions are not enough and other parser systems are too much: +situations where you may write your own recursive descent parser.
+
+Some unusual features of Yapps that may be of interest are: +
  1. Yapps produces recursive descent parsers that are readable by + humans, as opposed to table-driven parsers that are difficult to + read. A Yapps parser for a simple calculator looks similar to the + one that Mark Lutz wrote by hand for Programming Python.
    +
    +
  2. Yapps also allows for rules that accept parameters and pass + arguments to be used while parsing subexpressions. Grammars that + allow for arguments to be passed to subrules and for values to be + passed back are often called attribute grammars. In many + cases parameterized rules can be used to perform actions at “parse + time” that are usually delayed until later. For example, + information about variable declarations can be passed into the + rules that parse a procedure body, so that undefined variables can + be detected at parse time. The types of defined variables can be + used in parsing as well—for example, if the type of X is + known, we can determine whether X(1) is an array reference or + a function call.
    +
    +
  3. Yapps grammars are fairly easy to write, although there are + some inconveniences having to do with ELL(1) parsing that have to be + worked around. For example, rules have to be left factored and + rules may not be left recursive. However, neither limitation seems + to be a problem in practice.
    +
    +Yapps grammars look similar to the notation used in the Python + reference manual, with operators like *, +, |, + [], and () for patterns, names (tim) for rules, + regular expressions ("[a-z]+") for tokens, and # for + comments.
    +
    +
  4. The Yapps parser generator is written as a single Python module + with no C extensions. Yapps produces parsers that are written + entirely in Python, and require only the Yapps run-time module (5k) + for support.
    +
    +
  5. Yapps's scanner is context-sensitive, picking tokens based on + the types of the tokens accepted by the parser. This can be + helpful when implementing certain kinds of parsers, such as for a + preprocessor.
+There are several disadvantages of using Yapps over another parser system: +
  1. Yapps parsers are ELL(1) (Extended LL(1)), which is + less powerful than LALR (used by PyLR) or + SLR (used by kjParsing), so Yapps would not be a + good choice for parsing complex languages. For example, allowing + both x := 5; and x; as statements is difficult + because we must distinguish based on only one token of lookahead. + Seeing only x, we cannot decide whether we have an + assignment statement or an expression statement. (Note however + that this kind of grammar can be matched with backtracking; see + section F.)
    +
    +
  2. The scanner that Yapps provides can only read from strings, not + files, so an entire file has to be read in before scanning can + begin. It is possible to build a custom scanner, though, so in + cases where stream input is needed (from the console, a network, or + a large file are examples), the Yapps parser can be given a custom + scanner that reads from a stream instead of a string.
    +
    +
  3. Yapps is not designed with efficiency in mind.
+Yapps provides an easy to use parser generator that produces parsers +similar to what you might write by hand. It is not meant to be a +solution for all parsing problems, but instead an aid for those times +you would write a parser by hand rather than using one of the more +powerful parsing packages available.
+
+Yapps 2.0 is easier to use than Yapps 1.0. New features include a +less restrictive input syntax, which allows mixing of sequences, +choices, terminals, and nonterminals; optional matching; the ability +to insert single-line statements into the generated parser; and +looping constructs * and + similar to the repetitive +matching constructs in regular expressions. Unfortunately, the +addition of these constructs has made Yapps 2.0 incompatible with +Yapps 1.0, so grammars will have to be rewritten. See section +?? for tips on changing Yapps 1.0 grammars for use +with Yapps 2.0.
+
+ + +

2  Examples

+ +In this section are several examples that show the use of Yapps. +First, an introduction shows how to construct grammars and write them +in Yapps form. This example can be skipped by someone familiar with +grammars and parsing. Next is a Lisp expression grammar that produces +a parse tree as output. This example demonstrates the use of tokens +and rules, as well as returning values from rules. The third example +is an expression evaluation grammar that evaluates during parsing +(instead of producing a parse tree).
+
+ + +

2.1  Introduction to Grammars

+ +A grammar for a natural language specifies how words can be put +together to form large structures, such as phrases and sentences. A +grammar for a computer language is similar in that it specifies how +small components (called tokens) can be put together to form +larger structures. In this section we will write a grammar for a tiny +subset of English.
+
+Simple English sentences can be described as being a noun phrase +followed by a verb followed by a noun phrase. For example, in the +sentence, “Jack sank the blue ship,” the word “Jack” is the first +noun phrase, “sank” is the verb, and “the blue ship” is the second +noun phrase. In addition we should say what a noun phrase is; for +this example we shall say that a noun phrase is an optional article +(a, an, the) followed by any number of adjectives followed by a noun. +The tokens in our language are the articles, nouns, verbs, and +adjectives. The rules in our language will tell us how to +combine the tokens together to form lists of adjectives, noun phrases, +and sentences: + +Notice that some things that we said easily in English, such as +“optional article,” are expressed using special syntax, such as +brackets. When we said, “any number of adjectives,” we wrote +adjective*, where the * means “zero or more of the +preceding pattern”.
+
+The grammar given above is close to a Yapps grammar. We also have to +specify what the tokens are, and what to do when a pattern is matched. +For this example, we will do nothing when patterns are matched; the +next example will explain how to perform match actions. +
+parser TinyEnglish:
+  ignore:          "\\W+"
+  token noun:      "(Jack|spam|ship)"
+  token verb:      "(sank|threw)"
+  token article:   "(a|an|the)"
+  token adjective: "(blue|red|green)"
+
+  rule sentence:       noun_phrase verb noun_phrase
+  rule noun_phrase:    [article] adjective* noun
+
+The tokens are specified as Python regular expressions. Since +Yapps produces Python code, you can write any regular expression that +would be accepted by Python. (Note: These are Python 1.5 +regular expressions from the re module, not Python 1.4 +regular expressions from the regex module.) In addition to +tokens that you want to see (which are given names), you can also +specify tokens to ignore, marked by the ignore keyword. In +this parser we want to ignore whitespace.
+
+The TinyEnglish grammar shows how you define tokens and rules, but it +does not specify what should happen once we've matched the rules. In +the next example, we will take a grammar and produce a parse +tree from it.
+
+ + +

2.2  Lisp Expressions

+ +Lisp syntax, although hated by many, has a redeeming quality: it is +simple to parse. In this section we will construct a Yapps grammar to +parse Lisp expressions and produce a parse tree as output.
+
+ + +

Defining the Grammar

+ +The syntax of Lisp is simple. It has expressions, which are +identifiers, strings, numbers, and lists. A list is a left +parenthesis followed by some number of expressions (separated by +spaces) followed by a right parenthesis. For example, 5, +"ni", and (print "1+2 = " (+ 1 2)) are Lisp expressions. +Written as a grammar, +
+    expr:   ID | STR | NUM | list
+    list:   ( expr* )  
+
+In addition to having a grammar, we need to specify what to do every +time something is matched. For the tokens, which are strings, we just +want to get the “value” of the token, attach its type (identifier, +string, or number) in some way, and return it. For the lists, we want +to construct and return a Python list.
+
+Once some pattern is matched, we enclose a return statement enclosed +in {{...}}. The braces allow us to insert any one-line +statement into the parser. Within this statement, we can refer to the +values returned by matching each part of the rule. After matching a +token such as ID, “ID” will be bound to the text of the +matched token. Let's take a look at the rule: +
+    rule expr: ID   {{ return ('id', ID) }}
+      ...
+
+In a rule, tokens return the text that was matched. For identifiers, +we just return the identifier, along with a “tag” telling us that +this is an identifier and not a string or some other value. Sometimes +we may need to convert this text to a different form. For example, if +a string is matched, we want to remove quotes and handle special forms +like \n. If a number is matched, we want to convert it into a +number. Let's look at the return values for the other tokens: +
+      ...
+             | STR  {{ return ('str', eval(STR)) }}
+             | NUM  {{ return ('num', atoi(NUM)) }}
+      ...
+
+If we get a string, we want to remove the quotes and process any +special backslash codes, so we run eval on the quoted string. +If we get a number, we convert it to an integer with atoi and +then return the number along with its type tag.
+
+For matching a list, we need to do something slightly more +complicated. If we match a Lisp list of expressions, we want to +create a Python list with those values. +
+    rule list: "\\("                 # Match the opening parenthesis
+               {{ result = [] }}     # Create a Python list
+               ( 
+                  expr               # When we match an expression,
+                  {{ result.append(expr) }}   # add it to the list
+               )*                    # * means repeat this if needed
+               "\\)"                 # Match the closing parenthesis
+               {{ return result }}   # Return the Python list
+
+In this rule we first match the opening parenthesis, then go into a +loop. In this loop we match expressions and add them to the list. +When there are no more expressions to match, we match the closing +parenthesis and return the resulting list. Note that # is used for +comments, just as in Python.
+
+The complete grammar is specified as follows: +
+parser Lisp:
+    ignore:      '\\s+'
+    token NUM:   '[0-9]+'
+    token ID:    '[-+*/!@%^&=.a-zA-Z0-9_]+' 
+    token STR:   '"([^\\"]+|\\\\.)*"'
+
+    rule expr:   ID     {{ return ('id', ID) }}
+               | STR    {{ return ('str', eval(STR)) }}
+               | NUM    {{ return ('num', atoi(NUM)) }}
+               | list   {{ return list }}
+    rule list: "\\("    {{ result = [] }} 
+               ( expr   {{ result.append(expr) }}
+               )*  
+               "\\)"    {{ return result }} 
+
+One thing you may have noticed is that "\\(" and "\\)" +appear in the list rule. These are inline tokens: +they appear in the rules without being given a name with the +token keyword. Inline tokens are more convenient to use, but +since they do not have a name, the text that is matched cannot be used +in the return value. They are best used for short simple patterns +(usually punctuation or keywords).
+
+Another thing to notice is that the number and identifier tokens +overlap. For example, “487” matches both NUM and ID. In Yapps, the +scanner only tries to match tokens that are acceptable to the parser. +This rule doesn't help here, since both NUM and ID can appear in the +same place in the grammar. There are two rules used to pick tokens if +more than one matches. One is that the longest match is +preferred. For example, “487x” will match as an ID (487x) rather +than as a NUM (487) followed by an ID (x). The second rule is that if +the two matches are the same length, the first one listed in +the grammar is preferred. For example, “487” will match as a NUM +rather than an ID because NUM is listed first in the grammar. Inline +tokens have preference over any tokens you have listed.
+
+Now that our grammar is defined, we can run Yapps to produce a parser, +and then run the parser to produce a parse tree.
+
+ + +

Running Yapps

+ +In the Yapps module is a function generate that takes an +input filename and writes a parser to another file. We can use this +function to generate the Lisp parser, which is assumed to be in +lisp.g. +
+% python
+Python 1.5.1 (#1, Sep  3 1998, 22:51:17)  [GCC 2.7.2.3] on linux-i386
+Copyright 1991-1995 Stichting Mathematisch Centrum, Amsterdam
+>>> import yapps
+>>> yapps.generate('lisp.g')
+
+At this point, Yapps has written a file lisp.py that contains +the parser. In that file are two classes (one scanner and one parser) +and a function (called parse) that puts things together for +you.
+
+Alternatively, we can run Yapps from the command line to generate the +parser file: +
+% python yapps.py lisp.g
+
+After running Yapps either from within Python or from the command +line, we can use the Lisp parser by calling the parse +function. The first parameter should be the rule we want to match, +and the second parameter should be the string to parse. +
+>>> import lisp
+>>> lisp.parse('expr', '(+ 3 4)')
+[('id', '+'), ('num', 3), ('num', 4)]
+>>> lisp.parse('expr', '(print "3 = " (+ 1 2))')
+[('id', 'print'), ('str', '3 = '), [('id', '+'), ('num', 1), ('num', 2)]]
+
+The parse function is not the only way to use the parser; +section 5.1 describes how to access parser objects +directly.
+
+We've now gone through the steps in creating a grammar, writing a +grammar file for Yapps, producing a parser, and using the parser. In +the next example we'll see how rules can take parameters and also how +to do computations instead of just returning a parse tree.
+
+ + +

2.3  Calculator

+ +A common example parser given in many textbooks is that for simple +expressions, with numbers, addition, subtraction, multiplication, +division, and parenthesization of subexpressions. We'll write this +example in Yapps, evaluating the expression as we parse.
+
+Unlike yacc, Yapps does not have any way to specify +precedence rules, so we have to do it ourselves. We say that an +expression is the sum of terms, and that a term is the product of +factors, and that a factor is a number or a parenthesized expression: +
+    expr:           factor ( ("+"|"-") factor )*
+    factor:         term   ( ("*"|"/") term )*
+    term:           NUM | "(" expr ")"
+
+In order to evaluate the expression as we go, we should keep along an +accumulator while evaluating the lists of terms or factors. Just as +we kept a “result” variable to build a parse tree for Lisp +expressions, we will use a variable to evaluate numerical +expressions. The full grammar is given below: +
+parser Calculator:
+    token END: "$"         # $ means end of string
+    token NUM: "[0-9]+"
+
+    rule goal:           expr END         {{ return expr }}
+
+    # An expression is the sum and difference of factors
+    rule expr:           factor           {{ v = factor }}
+                       ( "[+]" factor       {{ v = v+factor }}
+                       |  "-"  factor       {{ v = v-factor }}
+                       )*                 {{ return v }}
+
+    # A factor is the product and division of terms
+    rule factor:         term             {{ v = term }}
+                       ( "[*]" term         {{ v = v*term }}
+                       |  "/"  term         {{ v = v/term }}
+                       )*                 {{ return v }}
+
+    # A term is either a number or an expression surrounded by parentheses
+    rule term:           NUM              {{ return atoi(NUM) }}
+                       | "\\(" expr "\\)" {{ return expr }}
+
+The top-level rule is goal, which says that we are looking for +an expression followed by the end of the string. The END +token is needed because without it, it isn't clear when to stop +parsing. For example, the string “1+3” could be parsed either as +the expression “1” followed by the string “+3” or it could be +parsed as the expression “1+3”. By requiring expressions to end +with END, the parser is forced to take “1+3”.
+
+In the two rules with repetition, the accumulator is named v. +After reading in one expression, we initialize the accumulator. Each +time through the loop, we modify the accumulator by adding, +subtracting, multiplying by, or dividing the previous accumulator by +the expression that has been parsed. At the end of the rule, we +return the accumulator.
+
+The calculator example shows how to process lists of elements using +loops, as well as how to handle precedence of operators.
+
+Note: It's often important to put the END token in, so +put it in unless you are sure that your grammar has some other +non-ambiguous token marking the end of the program.
+
+ + +

2.4  Calculator with Memory

+ +In the previous example we learned how to write a calculator that +evaluates simple numerical expressions. In this section we will +extend the example to support both local and global variables.
+
+To support global variables, we will add assignment statements to the +“goal” rule. +
+    rule goal:           expr END         {{ return expr }}
+              | 'set' ID expr END         {{ global_vars[ID] = expr }}
+                                          {{ return expr }}   
+
+To use these variables, we need a new kind of terminal: +
+    rule term: ... | ID {{ return global_vars[ID] }} 
+
+So far, these changes are straightforward. We simply have a global +dictionary global_vars that stores the variables and values, +we modify it when there is an assignment statement, and we look up +variables in it when we see a variable name.
+
+To support local variables, we will add variable declarations to the +set of allowed expressions. +
+    rule term: ... | 'let' VAR '=' expr 'in' expr ...
+
+This is where it becomes tricky. Local variables should be stored in +a local dictionary, not in the global one. One trick would be to save +a copy of the global dictionary, modify it, and then restore it +later. In this example we will instead use attributes to +create local information and pass it to subrules.
+
+A rule can optionally take parameters. When we invoke the rule, we +must pass in arguments. For local variables, let's use a single +parameter, local_vars: +
+    rule expr<<local_vars>>:   ...
+    rule factor<<local_vars>>: ...
+    rule term<<local_vars>>:   ...
+
+Each time we want to match expr, factor, or +term, we will pass the local variables in the current rule to +the subrule. One interesting case is when we pass as an argument +something other than local_vars: +
+   rule term<<local_vars>>: ...
+                | 'let' VAR '=' expr<<local_vars>>
+                  {{ local_vars = [(VAR, expr)] + local_vars }}
+                  'in' expr<<local_vars>>
+                  {{ return expr }}
+
+Note that the assignment to the local variables list does not modify +the original list. This is important to keep local variables from +being seen outside the “let”.
+
+The other interesting case is when we find a variable: +
+global_vars = {}
+
+def lookup(map, name):
+    for x,v in map:  if x==name: return v
+    return global_vars[name]
+%%
+   ...
+   rule term<<local_vars>>: ...
+                | VAR {{ return lookup(local_vars, VAR) }}
+
+The lookup function will search through the local variable list, and +if it cannot find the name there, it will look it up in the global +variable dictionary.
+
+A complete grammar for this example, including a read-eval-print loop +for interacting with the calculator, can be found in the examples +subdirectory included with Yapps.
+
+In this section we saw how to insert code before the parser. We also +saw how to use attributes to transmit local information from one rule +to its subrules.
+
+ + +

3  Grammars

+ +Each Yapps grammar has a name, a list of tokens, and a set of +production rules. A grammar named X will be used to produce +a parser named X and a scanner named XScanner. As +in Python, names are case sensitive, start with a letter, and contain +letters, numbers, and underscores (_).
+
+There are three kinds of tokens in Yapps: named, inline, and ignored. +As their name implies, named tokens are given a name, using the token +construct: token name : regexp. In a rule, the +token can be matched by using the name. Inline tokens are regular +expressions that are used in rules without being declared. Ignored +tokens are declared using the ignore construct: ignore: + regexp. These tokens are ignored by the scanner, and are +not seen by the parser. Often whitespace is an ignored token. The +regular expressions used to define tokens should use the syntax +defined in the re module, so some symbols may have to be +backslashed.
+
+Production rules in Yapps have a name and a pattern to match. If the +rule is parameterized, the name should be followed by a list of +parameter names in <<...>>. A pattern can be a simple pattern +or a compound pattern. Simple patterns are the name of a named token, +a regular expression in quotes (inline token), the name of a +production rule (followed by arguments in <<...>>, if the rule +has parameters), and single line Python statements ({{...}}). +Compound patterns are sequences (A B C ...), choices ( +A | B | C | ...), options ([...]), zero-or-more repetitions +(...*), and one-or-more repetitions (...+). Like +regular expressions, repetition operators have a higher precedence +than sequences, and sequences have a higher precedence than choices.
+
+Whenever {{...}} is used, a legal one-line Python statement +should be put inside the braces. The token }} should not +appear within the {{...}} section, even within a string, since +Yapps does not attempt to parse the Python statement. A workaround +for strings is to put two strings together ("}" "}"), or to use +backslashes ("}\}"). At the end of a rule you should use a +{{ return X }} statement to return a value. However, you +should not use any control statements (return, +continue, break) in the middle of a rule. Yapps +needs to make assumptions about the control flow to generate a parser, +and any changes to the control flow will confuse Yapps.
+
+The <<...>> form can occur in two places: to define parameters +to a rule and to give arguments when matching a rule. Parameters use +the syntax used for Python functions, so they can include default +arguments and the special forms (*args and **kwargs). +Arguments use the syntax for Python function call arguments, so they +can include normal arguments and keyword arguments. The token +>> should not appear within the <<...>> section.
+
+In both the statements and rule arguments, you can use names defined +by the parser to refer to matched patterns. You can refer to the text +matched by a named token by using the token name. You can use the +value returned by a production rule by using the name of that rule. +If a name X is matched more than once (such as in loops), you +will have to save the earlier value(s) in a temporary variable, and +then use that temporary variable in the return value. The next +section has an example of a name that occurs more than once.
+
+ + +

3.1  Left Factoring

+ + +Yapps produces ELL(1) parsers, which determine which clause to match +based on the first token available. Sometimes the leftmost tokens of +several clauses may be the same. The classic example is the +if/then/else construct in Pascal: +
+rule stmt:  "if" expr "then" stmt {{ then_part = stmt }} 
+                      "else" stmt {{ return ('If',expr,then_part,stmt) }}
+          | "if" expr "then" stmt {{ return ('If',expr,stmt,[]) }}
+
+(Note that we have to save the first stmt into a variable +because there is another stmt that will be matched.) The +left portions of the two clauses are the same, which presents a +problem for the parser. The solution is left-factoring: the +common parts are put together, and then a choice is made about +the remaining part: +
+rule stmt:  "if" expr 
+              "then" stmt {{ then_part = stmt }}
+              {{ else_part = [] }}
+              [ "else" stmt {{ else_part = stmt }} ]
+            {{ return ('If', expr, then_part, else_part) }}
+
+Unfortunately, the classic if/then/else situation is +still ambiguous when you left-factor. Yapps can deal with this +situation, but will report a warning; see section +3.3 for details.
+
+In general, replace rules of the form: +
+rule A:   a b1 {{ return E1 }}
+        | a b2 {{ return E2 }}
+        | c3   {{ return E3 }}
+        | c4   {{ return E4 }}
+
+with rules of the form: +
+rule A:   a ( b1 {{ return E1 }}
+            | b2 {{ return E2 }}
+            )
+        | c3   {{ return E3 }}
+        | c4   {{ return E4 }}
+
+ + +

3.2  Left Recursion

+ +A common construct in grammars is for matching a list of patterns, +sometimes separated with delimiters such as commas or semicolons. In +LR-based parser systems, we can parse a list with something like this: +
+rule sum:  NUM             {{ return NUM }}
+         | sum "+" NUM     {{ return (sum, NUM) }}
+
+Parsing 1+2+3+4 would produce the output +(((1,2),3),4), which is what we want from a left-associative +addition operator. Unfortunately, this grammar is left +recursive, because the sum rule contains a clause that +begins with sum. (The recursion occurs at the left side of +the clause.)
+
+We must restructure this grammar to be right recursive instead: +
+rule sum:  NUM             {{ return NUM }}
+         | NUM "+" sum     {{ return (NUM, sum) }}
+
+Unfortunately, using this grammar, 1+2+3+4 would be parsed as +(1,(2,(3,4))), which no longer follows left associativity. +The rule also needs to be left-factored. Instead, we write the +pattern as a loop instead: +
+rule sum:       NUM {{ v = NUM }}
+                ( "[+]" NUM {{ v = (v,NUM) }} )*
+                {{ return v }}
+
+In general, replace rules of the form: +
+rule A:  A a1 -> << E1 >> 
+       | A a2 -> << E2 >>
+       | b3   -> << E3 >>
+       | b4   -> << E4 >>
+
+with rules of the form: +
+rule A:  ( b3 {{ A = E3 }} 
+         | b4 {{ A = E4 }} )
+         ( a1 {{ A = E1 }}
+         | a2 {{ A = E2 }} )*
+         {{ return A }}
+
+We have taken a rule that proved problematic with recursion and +turned it into a rule that works well with looping constructs.
+
+ + +

3.3  Ambiguous Grammars

+ + +In section 3.1 we saw the classic if/then/else +ambiguity, which occurs because the “else ...” portion of an “if +...then ...else ...” construct is optional. Programs with +nested if/then/else constructs can be ambiguous when one of the else +clauses is missing: +
+if 1 then            if 1 then
+    if 5 then            if 5 then
+        x := 1;              x := 1;
+    else             else
+        y := 9;          y := 9;
+
+The indentation shows that the program can be parsed in two different +ways. (Of course, if we all would adopt Python's indentation-based +structuring, this would never happen!) Usually we want the parsing on +the left: the “else” should be associated with the closest “if” +statement. In section 3.1 we “solved” the +problem by using the following grammar: +
+rule stmt:  "if" expr 
+              "then" stmt {{ then_part = stmt }}
+              {{ else_part = [] }}
+              [ "else" stmt {{ else_part = stmt }} ]
+            {{ return ('If', expr, then_part, else_part) }}
+
+Here, we have an optional match of “else” followed by a statement. +The ambiguity is that if an “else” is present, it is not clear +whether you want it parsed immediately or if you want it to be parsed +by the outer “if”.
+
+Yapps will deal with the situation by matching the else pattern +when it can. The parser will work in this case because it prefers the +first matching clause, which tells Yapps to parse the “else”. +That is exactly what we want!
+
+For ambiguity cases with choices, Yapps will choose the first +matching choice. However, remember that Yapps only looks at the first +token to determine its decision, so (a b | a c) will result in +Yapps choosing a b even when the input is a c. It only +looks at the first token, a, to make its decision.
+
+ + +

4  Customization

+ +Both the parsers and the scanners can be customized. The parser is +usually extended by subclassing, and the scanner can either be +subclassed or completely replaced.
+
+ + +

4.1  Customizing Parsers

+ +If additional fields and methods are needed in order for a parser to +work, Python subclassing can be used. (This is unlike parser classes +written in static languages, in which these fields and methods must be +defined in the generated parser class.) We simply subclass the +generated parser, and add any fields or methods required. Expressions +in the grammar can call methods of the subclass to perform any actions +that cannot be expressed as a simple expression. For example, +consider this simple grammar: +
+parser X:
+    rule goal:  "something"  {{ self.printmsg() }}
+
+The printmsg function need not be implemented in the parser +class X; it can be implemented in a subclass: +
+import Xparser
+
+class MyX(Xparser.X):
+    def printmsg(self):
+        print "Hello!"
+
+ + +

4.2  Customizing Scanners

+ +The generated parser class is not dependent on the generated scanner +class. A scanner object is passed to the parser object's constructor +in the parse function. To use a different scanner, write +your own function to construct parser objects, with an instance of a +different scanner. Scanner objects must have a token method +that accepts an integer N as well as a list of allowed token +types, and returns the Nth token, as a tuple. The default scanner +raises NoMoreTokens if no tokens are available, and +SyntaxError if no token could be matched. However, the +parser does not rely on these exceptions; only the parse +convenience function (which calls wrap_error_reporter) and +the print_error error display function use those exceptions.
+
+The tuples representing tokens have four elements. The first two are +the beginning and ending indices of the matched text in the input +string. The third element is the type tag, matching either the name +of a named token or the quoted regexp of an inline or ignored token. +The fourth element of the token tuple is the matched text. If the +input string is s, and the token tuple is +(b,e,type,val), then val should be equal to +s[b:e].
+
+The generated parsers do not use the beginning or ending index. They use +only the token type and value. However, the default error reporter +uses the beginning and ending index to show the user where the error +is.
+
+ + +

5  Parser Mechanics

+ +The base parser class (Parser) defines two methods, _scan +and _peek, and two fields, _pos and +_scanner. The generated parser inherits from the base +parser, and contains one method for each rule in the grammar. To +avoid name clashes, do not use names that begin with an underscore +(_).
+
+ + +

5.1  Parser Objects

+ + +Yapps produces as output two exception classes, a scanner class, a +parser class, and a function parse that puts everything +together. The parse function does not have to be used; +instead, one can create a parser and scanner object and use them +together for parsing. +
+    def parse(rule, text):
+        P = X(XScanner(text))
+        return wrap_error_reporter(P, rule)
+
+The parse function takes a name of a rule and an input string +as input. It creates a scanner and parser object, then calls +wrap_error_reporter to execute the method in the parser +object named rule. The wrapper function will call the +appropriate parser rule and report any parsing errors to standard +output.
+
+There are several situations in which the parse function +would not be useful. If a different parser or scanner is being used, +or exceptions are to be handled differently, a new parse +function would be required. The supplied parse function can +be used as a template for writing a function for your own needs. An +example of a custom parse function is the generate function +in Yapps.py.
+
+ + +

5.2  Context Sensitive Scanner

+ +Unlike most scanners, the scanner produced by Yapps can take into +account the context in which tokens are needed, and try to match only +good tokens. For example, in the grammar: +
+parser IniFile:
+   token ID:   "[a-zA-Z_0-9]+"
+   token VAL:  ".*"
+
+   rule pair:  ID "[ \t]*=[ \t]*" VAL "\n"
+
+we would like to scan lines of text and pick out a name/value pair. +In a conventional scanner, the input string shell=progman.exe +would be turned into a single token of type VAL. The Yapps +scanner, however, knows that at the beginning of the line, an +ID is expected, so it will return "shell" as a token +of type ID. Later, it will return "progman.exe" as +a token of type VAL.
+
+Context sensitivity decreases the separation between scanner and +parser, but it is useful in parsers like IniFile, where the +tokens themselves are not unambiguous, but are unambiguous +given a particular stage in the parsing process.
+
+Unfortunately, context sensitivity can make it more difficult to +detect errors in the input. For example, in parsing a Pascal-like +language with “begin” and “end” as keywords, a context sensitive +scanner would only match “end” as the END token if the parser is in +a place that will accept the END token. If not, then the scanner +would match “end” as an identifier. To disable the context +sensitive scanner in Yapps, add the +context-insensitive-scanner option to the grammar: +
+Parser X:
+    option:  "context-insensitive-scanner"
+
+Context-insensitive scanning makes the parser look cleaner as well.
+
+ + +

5.3  Internal Variables

+ +There are two internal fields that may be of use. The parser object +has two fields, _pos, which is the index of the current +token being matched, and _scanner, which is the scanner +object. The token itself can be retrieved by accessing the scanner +object and calling the token method with the token index. However, if you call token before the token has been requested by the parser, it may mess up a context-sensitive scanner.1 A +potentially useful combination of these fields is to extract the +portion of the input matched by the current rule. To do this, just save the scanner state (_scanner.pos) before the text is matched and then again after the text is matched: +
+  rule R: 
+      {{ start = self._scanner.pos }}
+      a b c 
+      {{ end = self._scanner.pos }}
+      {{ print 'Text is', self._scanner.input[start:end] }}
+
+ + +

5.4  Pre- and Post-Parser Code

+ +Sometimes the parser code needs to rely on helper variables, +functions, and classes. A Yapps grammar can optionally be surrounded +by double percent signs, to separate the grammar from Python code. +
+... Python code ...
+%%
+... Yapps grammar ...
+%%
+... Python code ...
+
+The second %% can be omitted if there is no Python code at the +end, and the first %% can be omitted if there is no extra +Python code at all. (To have code only at the end, both separators +are required.)
+
+If the second %% is omitted, Yapps will insert testing code +that allows you to use the generated parser to parse a file.
+
+The extended calculator example in the Yapps examples subdirectory +includes both pre-parser and post-parser code.
+
+ + +

5.5  Representation of Grammars

+
+For each kind of pattern there is a class derived from Pattern. Yapps
+has classes for Terminal, NonTerminal, Sequence, Choice, Option, Plus,
+Star, and Eval. Each of these classes has the following interface:
+
+Both setup and update monotonically increase the
+variables they modify. Since the variables can only increase a finite
+number of times, we can repeatedly call the function until the
+variables stabilize. The used function is not currently
+implemented.
+
+With each pattern in the grammar Yapps associates three pieces of +information: the firstset, the followset, and the +accepts-є flag.
+
+The firstset contains the tokens that can appear as we start
+matching the pattern. The followset contains the tokens that can
+appear immediately after we match the pattern. The accepts-є
+flag is true if the pattern can match no tokens. In this case, first
+will contain all the elements in follow. The followset is not
+needed when accepts-є is false, and may not be accurate in
+those cases.
+
+Yapps does not compute these sets precisely. Its approximation can +miss certain cases, such as this one: +
+  rule C: ( A* | B )
+  rule B: C [A]
+
+Yapps will calculate C's followset to include A. +However, C will always match all the A's, so A will +never follow it. Yapps 2.0 does not properly handle this construct, +but if it seems important, I may add support for it in a future +version.
+
+Yapps also cannot handle constructs that depend on the calling +sequence. For example: +
+  rule R: U | 'b'
+  rule S:   | 'c'
+  rule T: S 'b'
+  rule U: S 'a'
+
+The followset for S includes a and b. Since S can be empty, the firstset for S should include a, +b, and c. However, when parsing R, if the lookahead +is b we should not parse U. That's because in U, S is followed by a and not b. Therefore in +R, we should choose rule U only if there is an a or +c, but not if there is a b. Yapps and many other LL(1) +systems do not distinguish S b and S a, making S's followset a, b, and making R always try to match +U. In this case we can solve the problem by changing R to +'b' | U but it may not always be possible to solve all such +problems in this way.
+
+ + +

A  Grammar for Parsers

+ +This is the grammar for parsers, without any Python code mixed in. +The complete grammar can be found in parsedesc.g in the Yapps +distribution. +
+parser ParserDescription:
+    ignore:      "\\s+"
+    ignore:      "#.*?\r?\n"
+    token END:   "$"  # $ means end of string
+    token ATTR:  "<<.+?>>"
+    token STMT:  "{{.+?}}"
+    token ID:    '[a-zA-Z_][a-zA-Z_0-9]*'
+    token STR:   '[rR]?\'([^\\n\'\\\\]|\\\\.)*\'|[rR]?"([^\\n"\\\\]|\\\\.)*"'
+
+    rule Parser: "parser" ID ":"
+                   Options
+                   Tokens
+                   Rules
+                 END 
+
+    rule Options:  ( "option" ":" STR )*
+    rule Tokens:   ( "token" ID ":" STR | "ignore"   ":" STR )*
+    rule Rules:    ( "rule" ID OptParam ":" ClauseA )*
+
+    rule ClauseA:  ClauseB ( '[|]' ClauseB )*
+    rule ClauseB:  ClauseC*
+    rule ClauseC:  ClauseD [ '[+]' | '[*]' ]
+    rule ClauseD:  STR | ID [ATTR] | STMT
+                 | '\\(' ClauseA '\\)' | '\\[' ClauseA '\\]'
+
+ + +

B  Upgrading

+ +Yapps 2.0 is not backwards compatible with Yapps 1.0. In this section +are some tips for upgrading: +
  1. + Yapps 1.0 was distributed as a single file. Yapps 2.0 is + instead distributed as two Python files: a parser generator + (26k) and a parser runtime (5k). You need both files to + create parsers, but you need only the runtime (yappsrt.py) + to use the parsers.
    +
    +
  2. Yapps 1.0 supported Python 1.4 regular expressions from the + regex module. Yapps 2.0 uses Python 1.5 regular + expressions from the re module. The new syntax for + regular expressions is not compatible with the old syntax. + Andrew Kuchling has a guide to converting + regular + expressionshttp://www.python.org/doc/howto/regex-to-re/ on his + web page.
    +
    +
  3. Yapps 1.0 wants a pattern and then a return value in -> + <<...>>. Yapps 2.0 allows patterns and Python statements to + be mixed. To convert a rule like this: +
    +rule R: A B C -> << E1 >>
    +      | X Y Z -> << E2 >>
    +
    to Yapps 2.0 form, replace the return value specifiers with return + statements: +
    +rule R: A B C {{ return E1 }}
    +      | X Y Z {{ return E2 }}
    +
  4. Yapps 2.0 does not perform tail recursion elimination. This + means any recursive rules you write will be turned into recursive + methods in the parser. The parser will work, but may be slower. + It can be made faster by rewriting recursive rules, using instead + the looping operators * and + provided in Yapps 2.0.
+ + +

C  Troubleshooting

+ + + + +

D  History

+ +Yapps 1 had several limitations that bothered me while writing +parsers: +
  1. + It was not possible to insert statements into the generated
 + parser. A common workaround was to write an auxiliary function
 + that executed those statements, and to call that function as part
 + of the return value calculation. For example, several of my
 + parsers had an “append(x,y)” function that existed solely to call
 + “x.append(y)”.
 +
  2. The way in which grammars were specified was rather + restrictive: a rule was a choice of clauses. Each clause was a + sequence of tokens and rule names, followed by a return value. +
  3. Optional matching had to be put into a separate rule because + choices were only made at the beginning of a rule. +
  4. Repetition had to be specified in terms of recursion. Not only + was this awkward (sometimes requiring additional rules), I had to + add a tail recursion optimization to Yapps to transform the + recursion back into a loop. +
+Yapps 2 addresses each of these limitations. +
  1. + Statements can occur anywhere within a rule. (However, only + one-line statements are allowed; multiline blocks marked by + indentation are not.) +
  2. Grammars can be specified using any mix of sequences, choices, + tokens, and rule names. To allow for complex structures, + parentheses can be used for grouping. +
  3. Given choices and parenthesization, optional matching can be + expressed as a choice between some pattern and nothing. In + addition, Yapps 2 has the convenience syntax [A B ...] for + matching A B ... optionally. +
  4. Repetition operators * for zero or more and + for + one or more make it easy to specify repeating patterns. +
+It is my hope that Yapps 2 will be flexible enough to meet my needs +for another year, yet simple enough that I do not hesitate to use it.
+
+ + +

E  Debian Extensions

+ + +The Debian version adds the following enhancements to the original +Yapps code. They were written by Matthias Urlichs. +
  1. + Yapps can stack input sources ("include files"). A usage example + is supplied with the calc.g sample program. +
  2. Yapps now understands augmented ignore-able patterns. + This means that Yapps can parse multi-line C comments; this wasn't + possible before. +
  3. Better error reporting. +
  4. Yapps now reads its input incrementally. +
+The generated parser has been renamed to yapps/runtime.py. +In Debian, this file is provided by the yapps2-runtime package. +You need to depend on it if you Debianize Python programs which use +yapps.
+
+ + +

F  Future Extensions

+ + +I am still investigating the possibility of LL(2) and higher +lookahead. However, it looks like the resulting parsers will be +somewhat ugly.
+
+It would be nice to control choices with user-defined predicates.
+
+The most likely future extension is backtracking. A grammar pattern +like (VAR ':=' expr)? {{ return Assign(VAR,expr) }} : expr {{ return expr }} +would turn into code that attempted to match VAR ':=' expr. If +it succeeded, it would run {{ return ... }}. If it failed, it +would match expr {{ return expr }}. Backtracking may make it +less necessary to write LL(2) grammars.
+
+ + +

G  References

+ +
  1. + The Python-Parser + SIGhttp://www.python.org/sigs/parser-sig/ is the first place + to look for a list of parser systems for Python.
    +
    +
  2. ANTLR/PCCTS, by Terrence Parr, is available at + The ANTLR Home Pagehttp://www.antlr.org/.
    +
    +
  3. PyLR, by Scott Cotton, is at his Starship + pagehttp://starship.skyport.net/crew/scott/PyLR.html.
    +
    +
  4. John Aycock's Compiling Little Languages + Frameworkhttp://www.foretec.com/python/workshops/1998-11/proceedings/papers/aycock-little/aycock-little.html.
    +
    +
  5. PyBison, by Scott Hassan, can be found at + his Python Projects + pagehttp://coho.stanford.edu/~hassan/Python/.
    +
    +
  6. mcf.pars, by Mike C. Fletcher, is available at + his web + pagehttp://members.rogers.com/mcfletch/programming/simpleparse/simpleparse.html.
    +
    +
  7. kwParsing, by Aaron Watters, is available at + his Starship + pagehttp://starship.skyport.net/crew/aaron_watters/kwParsing/. +
+ +
1
When using a context-sensitive scanner, the parser tells the scanner what the valid token types are at each point. If you call token before the parser can tell the scanner the valid token types, the scanner will attempt to match without considering the context. +
+ + + + +
This document was translated from LATEX by +HEVEA.
+ diff --git a/doc/yapps2.htoc b/doc/yapps2.htoc new file mode 100644 index 0000000..9fabbc6 --- /dev/null +++ b/doc/yapps2.htoc @@ -0,0 +1,36 @@ +\begin{tocenv} +\tocitem \@locref{htoc1}{\begin{@norefs}\@print{1}\quad{}Introduction\end{@norefs}} +\tocitem \@locref{htoc2}{\begin{@norefs}\@print{2}\quad{}Examples\end{@norefs}} +\begin{tocenv} +\tocitem \@locref{htoc3}{\begin{@norefs}\@print{2.1}\quad{}Introduction to Grammars\end{@norefs}} +\tocitem \@locref{htoc4}{\begin{@norefs}\@print{2.2}\quad{}Lisp Expressions\end{@norefs}} +\tocitem \@locref{htoc5}{\begin{@norefs}\@print{2.3}\quad{}Calculator\end{@norefs}} +\tocitem \@locref{htoc6}{\begin{@norefs}\@print{2.4}\quad{}Calculator with Memory\end{@norefs}} +\end{tocenv} +\tocitem \@locref{htoc7}{\begin{@norefs}\@print{3}\quad{}Grammars\end{@norefs}} +\begin{tocenv} +\tocitem \@locref{htoc8}{\begin{@norefs}\@print{3.1}\quad{}Left Factoring\end{@norefs}} +\tocitem \@locref{htoc9}{\begin{@norefs}\@print{3.2}\quad{}Left Recursion\end{@norefs}} +\tocitem \@locref{htoc10}{\begin{@norefs}\@print{3.3}\quad{}Ambiguous Grammars\end{@norefs}} +\end{tocenv} +\tocitem \@locref{htoc11}{\begin{@norefs}\@print{4}\quad{}Customization\end{@norefs}} +\begin{tocenv} +\tocitem \@locref{htoc12}{\begin{@norefs}\@print{4.1}\quad{}Customizing Parsers\end{@norefs}} +\tocitem \@locref{htoc13}{\begin{@norefs}\@print{4.2}\quad{}Customizing Scanners\end{@norefs}} +\end{tocenv} +\tocitem \@locref{htoc14}{\begin{@norefs}\@print{5}\quad{}Parser Mechanics\end{@norefs}} +\begin{tocenv} +\tocitem \@locref{htoc15}{\begin{@norefs}\@print{5.1}\quad{}Parser Objects\end{@norefs}} +\tocitem \@locref{htoc16}{\begin{@norefs}\@print{5.2}\quad{}Context Sensitive Scanner\end{@norefs}} +\tocitem \@locref{htoc17}{\begin{@norefs}\@print{5.3}\quad{}Internal Variables\end{@norefs}} +\tocitem \@locref{htoc18}{\begin{@norefs}\@print{5.4}\quad{}Pre- and Post-Parser Code\end{@norefs}} +\tocitem \@locref{htoc19}{\begin{@norefs}\@print{5.5}\quad{}Representation of 
Grammars\end{@norefs}} +\end{tocenv} +\tocitem \@locref{htoc20}{\begin{@norefs}\@print{A}\quad{}Grammar for Parsers\end{@norefs}} +\tocitem \@locref{htoc21}{\begin{@norefs}\@print{B}\quad{}Upgrading\end{@norefs}} +\tocitem \@locref{htoc22}{\begin{@norefs}\@print{C}\quad{}Troubleshooting\end{@norefs}} +\tocitem \@locref{htoc23}{\begin{@norefs}\@print{D}\quad{}History\end{@norefs}} +\tocitem \@locref{htoc24}{\begin{@norefs}\@print{E}\quad{}Debian Extensions\end{@norefs}} +\tocitem \@locref{htoc25}{\begin{@norefs}\@print{F}\quad{}Future Extensions\end{@norefs}} +\tocitem \@locref{htoc26}{\begin{@norefs}\@print{G}\quad{}References\end{@norefs}} +\end{tocenv} diff --git a/doc/yapps2.tex b/doc/yapps2.tex new file mode 100644 index 0000000..31b44be --- /dev/null +++ b/doc/yapps2.tex @@ -0,0 +1,1246 @@ +\documentclass[10pt]{article} +\usepackage{palatino} +\usepackage{html} +\usepackage{color} + +\setlength{\headsep}{0in} +\setlength{\headheight}{0in} +\setlength{\textheight}{8.5in} +\setlength{\textwidth}{5.9in} +\setlength{\oddsidemargin}{0.25in} + +\definecolor{darkblue}{rgb}{0,0,0.6} +\definecolor{darkerblue}{rgb}{0,0,0.3} + +%% \newcommand{\mysection}[1]{\section{\textcolor{darkblue}{#1}}} +%% \newcommand{\mysubsection}[1]{\subsection{\textcolor{darkerblue}{#1}}} +\newcommand{\mysection}[1]{\section{#1}} +\newcommand{\mysubsection}[1]{\subsection{#1}} + +\bodytext{bgcolor=white text=black link=#004080 vlink=#006020} + +\newcommand{\first}{\textsc{first}} +\newcommand{\follow}{\textsc{follow}} + +\begin{document} + +\begin{center} +\hfill \begin{tabular}{c} +{\Large The \emph{Yapps} Parser Generator System}\\ +\verb|http://theory.stanford.edu/~amitp/Yapps/|\\ + Version 2\\ +\\ +Amit J. 
Patel\\ +\htmladdnormallink{http://www-cs-students.stanford.edu/~amitp/} +{http://www-cs-students.stanford.edu/~amitp/} + +\end{tabular} \hfill \rule{0in}{0in} +\end{center} + +\mysection{Introduction} + +\emph{Yapps} (\underline{Y}et \underline{A}nother \underline{P}ython +\underline{P}arser \underline{S}ystem) is an easy to use parser +generator that is written in Python and generates Python code. There +are several parser generator systems already available for Python, +including \texttt{PyLR, kjParsing, PyBison,} and \texttt{mcf.pars,} +but I had different goals for my parser. Yapps is simple, is easy to +use, and produces human-readable parsers. It is not the fastest or +most powerful parser. Yapps is designed to be used when regular +expressions are not enough and other parser systems are too much: +situations where you may write your own recursive descent parser. + +Some unusual features of Yapps that may be of interest are: + +\begin{enumerate} + + \item Yapps produces recursive descent parsers that are readable by + humans, as opposed to table-driven parsers that are difficult to + read. A Yapps parser for a simple calculator looks similar to the + one that Mark Lutz wrote by hand for \emph{Programming Python.} + + \item Yapps also allows for rules that accept parameters and pass + arguments to be used while parsing subexpressions. Grammars that + allow for arguments to be passed to subrules and for values to be + passed back are often called \emph{attribute grammars.} In many + cases parameterized rules can be used to perform actions at ``parse + time'' that are usually delayed until later. For example, + information about variable declarations can be passed into the + rules that parse a procedure body, so that undefined variables can + be detected at parse time. The types of defined variables can be + used in parsing as well---for example, if the type of {\tt X} is + known, we can determine whether {\tt X(1)} is an array reference or + a function call. 
+ + \item Yapps grammars are fairly easy to write, although there are + some inconveniences having to do with ELL(1) parsing that have to be + worked around. For example, rules have to be left factored and + rules may not be left recursive. However, neither limitation seems + to be a problem in practice. + + Yapps grammars look similar to the notation used in the Python + reference manual, with operators like \verb:*:, \verb:+:, \verb:|:, + \verb:[]:, and \verb:(): for patterns, names ({\tt tim}) for rules, + regular expressions (\verb:"[a-z]+":) for tokens, and \verb:#: for + comments. + + \item The Yapps parser generator is written as a single Python module + with no C extensions. Yapps produces parsers that are written + entirely in Python, and require only the Yapps run-time module (5k) + for support. + + \item Yapps's scanner is context-sensitive, picking tokens based on + the types of the tokens accepted by the parser. This can be + helpful when implementing certain kinds of parsers, such as for a + preprocessor. + +\end{enumerate} + +There are several disadvantages of using Yapps over another parser system: + +\begin{enumerate} + + \item Yapps parsers are \texttt{ELL(1)} (Extended LL(1)), which is + less powerful than \texttt{LALR} (used by \texttt{PyLR}) or + \texttt{SLR} (used by \texttt{kjParsing}), so Yapps would not be a + good choice for parsing complex languages. For example, allowing + both \texttt{x := 5;} and \texttt{x;} as statements is difficult + because we must distinguish based on only one token of lookahead. + Seeing only \texttt{x}, we cannot decide whether we have an + assignment statement or an expression statement. (Note however + that this kind of grammar can be matched with backtracking; see + section \ref{sec:future}.) + + \item The scanner that Yapps provides can only read from strings, not + files, so an entire file has to be read in before scanning can + begin. 
It is possible to build a custom scanner, though, so in + cases where stream input is needed (from the console, a network, or + a large file are examples), the Yapps parser can be given a custom + scanner that reads from a stream instead of a string. + + \item Yapps is not designed with efficiency in mind. + +\end{enumerate} + +Yapps provides an easy to use parser generator that produces parsers +similar to what you might write by hand. It is not meant to be a +solution for all parsing problems, but instead an aid for those times +you would write a parser by hand rather than using one of the more +powerful parsing packages available. + +Yapps 2.0 is easier to use than Yapps 1.0. New features include a +less restrictive input syntax, which allows mixing of sequences, +choices, terminals, and nonterminals; optional matching; the ability +to insert single-line statements into the generated parser; and +looping constructs \verb|*| and \verb|+| similar to the repetitive +matching constructs in regular expressions. Unfortunately, the +addition of these constructs has made Yapps 2.0 incompatible with +Yapps 1.0, so grammars will have to be rewritten. See section +\ref{sec:Upgrading} for tips on changing Yapps 1.0 grammars for use +with Yapps 2.0. + +\mysection{Examples} + +In this section are several examples that show the use of Yapps. +First, an introduction shows how to construct grammars and write them +in Yapps form. This example can be skipped by someone familiar with +grammars and parsing. Next is a Lisp expression grammar that produces +a parse tree as output. This example demonstrates the use of tokens +and rules, as well as returning values from rules. The third example +is a expression evaluation grammar that evaluates during parsing +(instead of producing a parse tree). + +\mysubsection{Introduction to Grammars} + +A \emph{grammar} for a natural language specifies how words can be put +together to form large structures, such as phrases and sentences. 
A +grammar for a computer language is similar in that it specifies how +small components (called \emph{tokens}) can be put together to form +larger structures. In this section we will write a grammar for a tiny +subset of English. + +Simple English sentences can be described as being a noun phrase +followed by a verb followed by a noun phrase. For example, in the +sentence, ``Jack sank the blue ship,'' the word ``Jack'' is the first +noun phrase, ``sank'' is the verb, and ``the blue ship'' is the second +noun phrase. In addition we should say what a noun phrase is; for +this example we shall say that a noun phrase is an optional article +(a, an, the) followed by any number of adjectives followed by a noun. +The tokens in our language are the articles, nouns, verbs, and +adjectives. The \emph{rules} in our language will tell us how to +combine the tokens together to form lists of adjectives, noun phrases, +and sentences: + +\begin{itemize} + \item \texttt{sentence: noun\_phrase verb noun\_phrase} + \item \texttt{noun\_phrase: [article] adjective* noun} +\end{itemize} + +Notice that some things that we said easily in English, such as +``optional article,'' are expressed using special syntax, such as +brackets. When we said, ``any number of adjectives,'' we wrote +\texttt{adjective*}, where the \texttt{*} means ``zero or more of the +preceding pattern''. + +The grammar given above is close to a Yapps grammar. We also have to +specify what the tokens are, and what to do when a pattern is matched. +For this example, we will do nothing when patterns are matched; the +next example will explain how to perform match actions. 
+ +\begin{verbatim} +parser TinyEnglish: + ignore: "\\W+" + token noun: "(Jack|spam|ship)" + token verb: "(sank|threw)" + token article: "(a|an|the)" + token adjective: "(blue|red|green)" + + rule sentence: noun_phrase verb noun_phrase + rule noun_phrase: [article] adjective* noun +\end{verbatim} + +The tokens are specified as Python \emph{regular expressions}. Since +Yapps produces Python code, you can write any regular expression that +would be accepted by Python. (\emph{Note:} These are Python 1.5 +regular expressions from the \texttt{re} module, not Python 1.4 +regular expressions from the \texttt{regex} module.) In addition to +tokens that you want to see (which are given names), you can also +specify tokens to ignore, marked by the \texttt{ignore} keyword. In +this parser we want to ignore whitespace. + +The TinyEnglish grammar shows how you define tokens and rules, but it +does not specify what should happen once we've matched the rules. In +the next example, we will take a grammar and produce a \emph{parse +tree} from it. + +\mysubsection{Lisp Expressions} + +Lisp syntax, although hated by many, has a redeeming quality: it is +simple to parse. In this section we will construct a Yapps grammar to +parse Lisp expressions and produce a parse tree as output. + +\subsubsection*{Defining the Grammar} + +The syntax of Lisp is simple. It has expressions, which are +identifiers, strings, numbers, and lists. A list is a left +parenthesis followed by some number of expressions (separated by +spaces) followed by a right parenthesis. For example, \verb|5|, +\verb|"ni"|, and \verb|(print "1+2 = " (+ 1 2))| are Lisp expressions. +Written as a grammar, + +\begin{verbatim} + expr: ID | STR | NUM | list + list: ( expr* ) +\end{verbatim} + +In addition to having a grammar, we need to specify what to do every +time something is matched. 
For the tokens, which are strings, we just +want to get the ``value'' of the token, attach its type (identifier, +string, or number) in some way, and return it. For the lists, we want +to construct and return a Python list. + +Once some pattern is matched, we enclose a return statement enclosed +in \verb|{{...}}|. The braces allow us to insert any one-line +statement into the parser. Within this statement, we can refer to the +values returned by matching each part of the rule. After matching a +token such as \texttt{ID}, ``ID'' will be bound to the text of the +matched token. Let's take a look at the rule: + +\begin{verbatim} + rule expr: ID {{ return ('id', ID) }} + ... +\end{verbatim} + +In a rule, tokens return the text that was matched. For identifiers, +we just return the identifier, along with a ``tag'' telling us that +this is an identifier and not a string or some other value. Sometimes +we may need to convert this text to a different form. For example, if +a string is matched, we want to remove quotes and handle special forms +like \verb|\n|. If a number is matched, we want to convert it into a +number. Let's look at the return values for the other tokens: + +\begin{verbatim} + ... + | STR {{ return ('str', eval(STR)) }} + | NUM {{ return ('num', atoi(NUM)) }} + ... +\end{verbatim} + +If we get a string, we want to remove the quotes and process any +special backslash codes, so we run \texttt{eval} on the quoted string. +If we get a number, we convert it to an integer with \texttt{atoi} and +then return the number along with its type tag. + +For matching a list, we need to do something slightly more +complicated. If we match a Lisp list of expressions, we want to +create a Python list with those values. 
+ +\begin{verbatim} + rule list: "\\(" # Match the opening parenthesis + {{ result = [] }} # Create a Python list + ( + expr # When we match an expression, + {{ result.append(expr) }} # add it to the list + )* # * means repeat this if needed + "\\)" # Match the closing parenthesis + {{ return result }} # Return the Python list +\end{verbatim} + +In this rule we first match the opening parenthesis, then go into a +loop. In this loop we match expressions and add them to the list. +When there are no more expressions to match, we match the closing +parenthesis and return the resulting. Note that \verb:#: is used for +comments, just as in Python. + +The complete grammar is specified as follows: +\begin{verbatim} +parser Lisp: + ignore: '\\s+' + token NUM: '[0-9]+' + token ID: '[-+*/!@%^&=.a-zA-Z0-9_]+' + token STR: '"([^\\"]+|\\\\.)*"' + + rule expr: ID {{ return ('id', ID) }} + | STR {{ return ('str', eval(STR)) }} + | NUM {{ return ('num', atoi(NUM)) }} + | list {{ return list }} + rule list: "\\(" {{ result = [] }} + ( expr {{ result.append(expr) }} + )* + "\\)" {{ return result }} +\end{verbatim} + +One thing you may have noticed is that \verb|"\\("| and \verb|"\\)"| +appear in the \texttt{list} rule. These are \emph{inline tokens}: +they appear in the rules without being given a name with the +\texttt{token} keyword. Inline tokens are more convenient to use, but +since they do not have a name, the text that is matched cannot be used +in the return value. They are best used for short simple patterns +(usually punctuation or keywords). + +Another thing to notice is that the number and identifier tokens +overlap. For example, ``487'' matches both NUM and ID. In Yapps, the +scanner only tries to match tokens that are acceptable to the parser. +This rule doesn't help here, since both NUM and ID can appear in the +same place in the grammar. There are two rules used to pick tokens if +more than one matches. One is that the \emph{longest} match is +preferred. 
For example, ``487x'' will match as an ID (487x) rather +than as a NUM (487) followed by an ID (x). The second rule is that if +the two matches are the same length, the \emph{first} one listed in +the grammar is preferred. For example, ``487'' will match as an NUM +rather than an ID because NUM is listed first in the grammar. Inline +tokens have preference over any tokens you have listed. + +Now that our grammar is defined, we can run Yapps to produce a parser, +and then run the parser to produce a parse tree. + +\subsubsection*{Running Yapps} + +In the Yapps module is a function \texttt{generate} that takes an +input filename and writes a parser to another file. We can use this +function to generate the Lisp parser, which is assumed to be in +\texttt{lisp.g}. + +\begin{verbatim} +% python +Python 1.5.1 (#1, Sep 3 1998, 22:51:17) [GCC 2.7.2.3] on linux-i386 +Copyright 1991-1995 Stichting Mathematisch Centrum, Amsterdam +>>> import yapps +>>> yapps.generate('lisp.g') +\end{verbatim} + +At this point, Yapps has written a file \texttt{lisp.py} that contains +the parser. In that file are two classes (one scanner and one parser) +and a function (called \texttt{parse}) that puts things together for +you. + +Alternatively, we can run Yapps from the command line to generate the +parser file: + +\begin{verbatim} +% python yapps.py lisp.g +\end{verbatim} + +After running Yapps either from within Python or from the command +line, we can use the Lisp parser by calling the \texttt{parse} +function. The first parameter should be the rule we want to match, +and the second parameter should be the string to parse. 
+ +\begin{verbatim} +>>> import lisp +>>> lisp.parse('expr', '(+ 3 4)') +[('id', '+'), ('num', 3), ('num', 4)] +>>> lisp.parse('expr', '(print "3 = " (+ 1 2))') +[('id', 'print'), ('str', '3 = '), [('id', '+'), ('num', 1), ('num', 2)]] +\end{verbatim} + +The \texttt{parse} function is not the only way to use the parser; +section \ref{sec:Parser-Objects} describes how to access parser objects +directly. + +We've now gone through the steps in creating a grammar, writing a +grammar file for Yapps, producing a parser, and using the parser. In +the next example we'll see how rules can take parameters and also how +to do computations instead of just returning a parse tree. + +\mysubsection{Calculator} + +A common example parser given in many textbooks is that for simple +expressions, with numbers, addition, subtraction, multiplication, +division, and parenthesization of subexpressions. We'll write this +example in Yapps, evaluating the expression as we parse. + +Unlike \texttt{yacc}, Yapps does not have any way to specify +precedence rules, so we have to do it ourselves. We say that an +expression is the sum of terms, and that a term is the product of +factors, and that a factor is a number or a parenthesized expression: + +\begin{verbatim} + expr: factor ( ("+"|"-") factor )* + factor: term ( ("*"|"/") term )* + term: NUM | "(" expr ")" +\end{verbatim} + +In order to evaluate the expression as we go, we should keep along an +accumulator while evaluating the lists of terms or factors. Just as +we kept a ``result'' variable to build a parse tree for Lisp +expressions, we will use a variable to evaluate numerical +expressions. 
The full grammar is given below: + +\begin{verbatim} +parser Calculator: + token END: "$" # $ means end of string + token NUM: "[0-9]+" + + rule goal: expr END {{ return expr }} + + # An expression is the sum and difference of factors + rule expr: factor {{ v = factor }} + ( "[+]" factor {{ v = v+factor }} + | "-" factor {{ v = v-factor }} + )* {{ return v }} + + # A factor is the product and division of terms + rule factor: term {{ v = term }} + ( "[*]" term {{ v = v*term }} + | "/" term {{ v = v/term }} + )* {{ return v }} + + # A term is either a number or an expression surrounded by parentheses + rule term: NUM {{ return atoi(NUM) }} + | "\\(" expr "\\)" {{ return expr }} +\end{verbatim} + +The top-level rule is \emph{goal}, which says that we are looking for +an expression followed by the end of the string. The \texttt{END} +token is needed because without it, it isn't clear when to stop +parsing. For example, the string ``1+3'' could be parsed either as +the expression ``1'' followed by the string ``+3'' or it could be +parsed as the expression ``1+3''. By requiring expressions to end +with \texttt{END}, the parser is forced to take ``1+3''. + +In the two rules with repetition, the accumulator is named \texttt{v}. +After reading in one expression, we initialize the accumulator. Each +time through the loop, we modify the accumulator by adding, +subtracting, multiplying by, or dividing the previous accumulator by +the expression that has been parsed. At the end of the rule, we +return the accumulator. + +The calculator example shows how to process lists of elements using +loops, as well as how to handle precedence of operators. + +\emph{Note:} It's often important to put the \texttt{END} token in, so +put it in unless you are sure that your grammar has some other +non-ambiguous token marking the end of the program. 
+
+\mysubsection{Calculator with Memory}
+
+In the previous example we learned how to write a calculator that
+evaluates simple numerical expressions. In this section we will
+extend the example to support both local and global variables.
+
+To support global variables, we will add assignment statements to the
+``goal'' rule.
+
+\begin{verbatim}
+ rule goal: expr END {{ return expr }}
+ | 'set' ID expr END {{ global_vars[ID] = expr }}
+ {{ return expr }}
+\end{verbatim}
+
+To use these variables, we need a new kind of terminal:
+
+\begin{verbatim}
+ rule term: ... | ID {{ return global_vars[ID] }}
+\end{verbatim}
+
+So far, these changes are straightforward. We simply have a global
+dictionary \texttt{global\_vars} that stores the variables and values,
+we modify it when there is an assignment statement, and we look up
+variables in it when we see a variable name.
+
+To support local variables, we will add variable declarations to the
+set of allowed expressions.
+
+\begin{verbatim}
+ rule term: ... | 'let' VAR '=' expr 'in' expr ...
+\end{verbatim}
+
+This is where it becomes tricky. Local variables should be stored in
+a local dictionary, not in the global one. One trick would be to save
+a copy of the global dictionary, modify it, and then restore it
+later. In this example we will instead use \emph{attributes} to
+create local information and pass it to subrules.
+
+A rule can optionally take parameters. When we invoke the rule, we
+must pass in arguments. For local variables, let's use a single
+parameter, \texttt{local\_vars}:
+
+\begin{verbatim}
+ rule expr<<local_vars>>: ...
+ rule factor<<local_vars>>: ...
+ rule term<<local_vars>>: ...
+\end{verbatim}
+
+Each time we want to match \texttt{expr}, \texttt{factor}, or
+\texttt{term}, we will pass the local variables in the current rule to
+the subrule. One interesting case is when we pass as an argument
+something \emph{other} than \texttt{local\_vars}:
+
+\begin{verbatim}
+ rule term<<local_vars>>: ...
+ | 'let' VAR '=' expr<<local_vars>>
+ {{ local_vars = [(VAR, expr)] + local_vars }}
+ 'in' expr<<local_vars>>
+ {{ return expr }}
+\end{verbatim}
+
+Note that the assignment to the local variables list does not modify
+the original list. This is important to keep local variables from
+being seen outside the ``let''.
+
+The other interesting case is when we find a variable:
+
+\begin{verbatim}
+global_vars = {}
+
+def lookup(map, name):
+ for x,v in map: if x==name: return v
+ return global_vars[name]
+%%
+ ...
+ rule term<<local_vars>>: ...
+ | VAR {{ return lookup(local_vars, VAR) }}
+\end{verbatim}
+
+The lookup function will search through the local variable list, and
+if it cannot find the name there, it will look it up in the global
+variable dictionary.
+
+A complete grammar for this example, including a read-eval-print loop
+for interacting with the calculator, can be found in the examples
+subdirectory included with Yapps.
+
+In this section we saw how to insert code before the parser. We also
+saw how to use attributes to transmit local information from one rule
+to its subrules.
+
+\mysection{Grammars}
+
+Each Yapps grammar has a name, a list of tokens, and a set of
+production rules. A grammar named \texttt{X} will be used to produce
+a parser named \texttt{X} and a scanner named \texttt{XScanner}. As
+in Python, names are case sensitive, start with a letter, and contain
+letters, numbers, and underscores (\_).
+
+There are three kinds of tokens in Yapps: named, inline, and ignored.
+As their name implies, named tokens are given a name, using the token
+construct: \texttt{token \emph{name} : \emph{regexp}}. In a rule, the
+token can be matched by using the name. Inline tokens are regular
+expressions that are used in rules without being declared. Ignored
+tokens are declared using the ignore construct: \texttt{ignore:
+ \emph{regexp}}. These tokens are ignored by the scanner, and are
+not seen by the parser. Often whitespace is an ignored token.
The +regular expressions used to define tokens should use the syntax +defined in the \texttt{re} module, so some symbols may have to be +backslashed. + +Production rules in Yapps have a name and a pattern to match. If the +rule is parameterized, the name should be followed by a list of +parameter names in \verb|<<...>>|. A pattern can be a simple pattern +or a compound pattern. Simple patterns are the name of a named token, +a regular expression in quotes (inline token), the name of a +production rule (followed by arguments in \verb|<<...>>|, if the rule +has parameters), and single line Python statements (\verb|{{...}}|). +Compound patterns are sequences (\verb|A B C ...|), choices ( +\verb:A | B | C | ...:), options (\verb|[...]|), zero-or-more repetitions +(\verb|...*|), and one-or-more repetitions (\verb|...+|). Like +regular expressions, repetition operators have a higher precedence +than sequences, and sequences have a higher precedence than choices. + +Whenever \verb|{{...}}| is used, a legal one-line Python statement +should be put inside the braces. The token \verb|}}| should not +appear within the \verb|{{...}}| section, even within a string, since +Yapps does not attempt to parse the Python statement. A workaround +for strings is to put two strings together (\verb|"}" "}"|), or to use +backslashes (\verb|"}\}"|). At the end of a rule you should use a +\verb|{{ return X }}| statement to return a value. However, you +should \emph{not} use any control statements (\texttt{return}, +\texttt{continue}, \texttt{break}) in the middle of a rule. Yapps +needs to make assumptions about the control flow to generate a parser, +and any changes to the control flow will confuse Yapps. + +The \verb|<<...>>| form can occur in two places: to define parameters +to a rule and to give arguments when matching a rule. Parameters use +the syntax used for Python functions, so they can include default +arguments and the special forms (\verb|*args| and \verb|**kwargs|). 
+Arguments use the syntax for Python function call arguments, so they +can include normal arguments and keyword arguments. The token +\verb|>>| should not appear within the \verb|<<...>>| section. + +In both the statements and rule arguments, you can use names defined +by the parser to refer to matched patterns. You can refer to the text +matched by a named token by using the token name. You can use the +value returned by a production rule by using the name of that rule. +If a name \texttt{X} is matched more than once (such as in loops), you +will have to save the earlier value(s) in a temporary variable, and +then use that temporary variable in the return value. The next +section has an example of a name that occurs more than once. + +\mysubsection{Left Factoring} +\label{sec:Left-Factoring} + +Yapps produces ELL(1) parsers, which determine which clause to match +based on the first token available. Sometimes the leftmost tokens of +several clauses may be the same. The classic example is the +\emph{if/then/else} construct in Pascal: + +\begin{verbatim} +rule stmt: "if" expr "then" stmt {{ then_part = stmt }} + "else" stmt {{ return ('If',expr,then_part,stmt) }} + | "if" expr "then" stmt {{ return ('If',expr,stmt,[]) }} +\end{verbatim} + +(Note that we have to save the first \texttt{stmt} into a variable +because there is another \texttt{stmt} that will be matched.) The +left portions of the two clauses are the same, which presents a +problem for the parser. The solution is \emph{left-factoring}: the +common parts are put together, and \emph{then} a choice is made about +the remaining part: + +\begin{verbatim} +rule stmt: "if" expr + "then" stmt {{ then_part = stmt }} + {{ else_part = [] }} + [ "else" stmt {{ else_part = stmt }} ] + {{ return ('If', expr, then_part, else_part) }} +\end{verbatim} + +Unfortunately, the classic \emph{if/then/else} situation is +\emph{still} ambiguous when you left-factor. 
Yapps can deal with this +situation, but will report a warning; see section +\ref{sec:Ambiguous-Grammars} for details. + +In general, replace rules of the form: + +\begin{verbatim} +rule A: a b1 {{ return E1 }} + | a b2 {{ return E2 }} + | c3 {{ return E3 }} + | c4 {{ return E4 }} +\end{verbatim} + +with rules of the form: + +\begin{verbatim} +rule A: a ( b1 {{ return E1 }} + | b2 {{ return E2 }} + ) + | c3 {{ return E3 }} + | c4 {{ return E4 }} +\end{verbatim} + +\mysubsection{Left Recursion} + +A common construct in grammars is for matching a list of patterns, +sometimes separated with delimiters such as commas or semicolons. In +LR-based parser systems, we can parse a list with something like this: + +\begin{verbatim} +rule sum: NUM {{ return NUM }} + | sum "+" NUM {{ return (sum, NUM) }} +\end{verbatim} + +Parsing \texttt{1+2+3+4} would produce the output +\texttt{(((1,2),3),4)}, which is what we want from a left-associative +addition operator. Unfortunately, this grammar is \emph{left +recursive,} because the \texttt{sum} rule contains a clause that +begins with \texttt{sum}. (The recursion occurs at the left side of +the clause.) + +We must restructure this grammar to be \emph{right recursive} instead: + +\begin{verbatim} +rule sum: NUM {{ return NUM }} + | NUM "+" sum {{ return (NUM, sum) }} +\end{verbatim} + +Unfortunately, using this grammar, \texttt{1+2+3+4} would be parsed as +\texttt{(1,(2,(3,4)))}, which no longer follows left associativity. +The rule also needs to be left-factored. 
Instead, we write the +pattern as a loop instead: + +\begin{verbatim} +rule sum: NUM {{ v = NUM }} + ( "[+]" NUM {{ v = (v,NUM) }} )* + {{ return v }} +\end{verbatim} + +In general, replace rules of the form: + +\begin{verbatim} +rule A: A a1 -> << E1 >> + | A a2 -> << E2 >> + | b3 -> << E3 >> + | b4 -> << E4 >> +\end{verbatim} + +with rules of the form: + +\begin{verbatim} +rule A: ( b3 {{ A = E3 }} + | b4 {{ A = E4 }} ) + ( a1 {{ A = E1 }} + | a2 {{ A = E2 }} )* + {{ return A }} +\end{verbatim} + +We have taken a rule that proved problematic for with recursion and +turned it into a rule that works well with looping constructs. + +\mysubsection{Ambiguous Grammars} +\label{sec:Ambiguous-Grammars} + +In section \ref{sec:Left-Factoring} we saw the classic if/then/else +ambiguity, which occurs because the ``else \ldots'' portion of an ``if +\ldots then \ldots else \ldots'' construct is optional. Programs with +nested if/then/else constructs can be ambiguous when one of the else +clauses is missing: +\begin{verbatim} +if 1 then if 1 then + if 5 then if 5 then + x := 1; x := 1; + else else + y := 9; y := 9; +\end{verbatim} + +The indentation shows that the program can be parsed in two different +ways. (Of course, if we all would adopt Python's indentation-based +structuring, this would never happen!) Usually we want the parsing on +the left: the ``else'' should be associated with the closest ``if'' +statement. In section \ref{sec:Left-Factoring} we ``solved'' the +problem by using the following grammar: + +\begin{verbatim} +rule stmt: "if" expr + "then" stmt {{ then_part = stmt }} + {{ else_part = [] }} + [ "else" stmt {{ else_part = stmt }} ] + {{ return ('If', expr, then_part, else_part) }} +\end{verbatim} + +Here, we have an optional match of ``else'' followed by a statement. +The ambiguity is that if an ``else'' is present, it is not clear +whether you want it parsed immediately or if you want it to be parsed +by the outer ``if''. 
+
+Yapps will deal with the situation by matching the else pattern
+when it can. The parser will work in this case because it prefers the
+\emph{first} matching clause, which tells Yapps to parse the ``else''.
+That is exactly what we want!
+
+For ambiguity cases with choices, Yapps will choose the \emph{first}
+matching choice. However, remember that Yapps only looks at the first
+token to determine its decision, so {\tt (a b | a c)} will result in
+Yapps choosing {\tt a b} even when the input is {\tt a c}. It only
+looks at the first token, {\tt a}, to make its decision.
+
+\mysection{Customization}
+
+Both the parsers and the scanners can be customized. The parser is
+usually extended by subclassing, and the scanner can either be
+subclassed or completely replaced.
+
+\mysubsection{Customizing Parsers}
+
+If additional fields and methods are needed in order for a parser to
+work, Python subclassing can be used. (This is unlike parser classes
+written in static languages, in which these fields and methods must be
+defined in the generated parser class.) We simply subclass the
+generated parser, and add any fields or methods required. Expressions
+in the grammar can call methods of the subclass to perform any actions
+that cannot be expressed as a simple expression. For example,
+consider this simple grammar:
+
+\begin{verbatim}
+parser X:
+ rule goal: "something" {{ self.printmsg() }}
+\end{verbatim}
+
+The \texttt{printmsg} function need not be implemented in the parser
+class \texttt{X}; it can be implemented in a subclass:
+
+\begin{verbatim}
+import Xparser
+
+class MyX(Xparser.X):
+ def printmsg(self):
+ print "Hello!"
+\end{verbatim}
+
+\mysubsection{Customizing Scanners}
+
+The generated parser class is not dependent on the generated scanner
+class. A scanner object is passed to the parser object's constructor
+in the \texttt{parse} function.
To use a different scanner, write
+your own function to construct parser objects, with an instance of a
+different scanner. Scanner objects must have a \texttt{token} method
+that accepts an integer \texttt{N} as well as a list of allowed token
+types, and returns the Nth token, as a tuple. The default scanner
+raises \texttt{NoMoreTokens} if no tokens are available, and
+\texttt{SyntaxError} if no token could be matched. However, the
+parser does not rely on these exceptions; only the \texttt{parse}
+convenience function (which calls \texttt{wrap\_error\_reporter}) and
+the \texttt{print\_error} error display function use those exceptions.
+
+The tuples representing tokens have four elements. The first two are
+the beginning and ending indices of the matched text in the input
+string. The third element is the type tag, matching either the name
+of a named token or the quoted regexp of an inline or ignored token.
+The fourth element of the token tuple is the matched text. If the
+input string is \texttt{s}, and the token tuple is
+\texttt{(b,e,type,val)}, then \texttt{val} should be equal to
+\texttt{s[b:e]}.
+
+The generated parsers do not use the beginning or ending index. They use
+only the token type and value. However, the default error reporter
+uses the beginning and ending index to show the user where the error
+is.
+
+\mysection{Parser Mechanics}
+
+The base parser class (Parser) defines two methods, \texttt{\_scan}
+and \texttt{\_peek}, and two fields, \texttt{\_pos} and
+\texttt{\_scanner}. The generated parser inherits from the base
+parser, and contains one method for each rule in the grammar. To
+avoid name clashes, do not use names that begin with an underscore
+(\texttt{\_}).
+
+\mysubsection{Parser Objects}
+\label{sec:Parser-Objects}
+
+Yapps produces as output two exception classes, a scanner class, a
+parser class, and a function \texttt{parse} that puts everything
+together.
The \texttt{parse} function does not have to be used; +instead, one can create a parser and scanner object and use them +together for parsing. + +\begin{verbatim} + def parse(rule, text): + P = X(XScanner(text)) + return wrap_error_reporter(P, rule) +\end{verbatim} + +The \texttt{parse} function takes a name of a rule and an input string +as input. It creates a scanner and parser object, then calls +\texttt{wrap\_error\_reporter} to execute the method in the parser +object named \texttt{rule}. The wrapper function will call the +appropriate parser rule and report any parsing errors to standard +output. + +There are several situations in which the \texttt{parse} function +would not be useful. If a different parser or scanner is being used, +or exceptions are to be handled differently, a new \texttt{parse} +function would be required. The supplied \texttt{parse} function can +be used as a template for writing a function for your own needs. An +example of a custom parse function is the \texttt{generate} function +in \texttt{Yapps.py}. + +\mysubsection{Context Sensitive Scanner} + +Unlike most scanners, the scanner produced by Yapps can take into +account the context in which tokens are needed, and try to match only +good tokens. For example, in the grammar: + +\begin{verbatim} +parser IniFile: + token ID: "[a-zA-Z_0-9]+" + token VAL: ".*" + + rule pair: ID "[ \t]*=[ \t]*" VAL "\n" +\end{verbatim} + +we would like to scan lines of text and pick out a name/value pair. +In a conventional scanner, the input string \texttt{shell=progman.exe} +would be turned into a single token of type \texttt{VAL}. The Yapps +scanner, however, knows that at the beginning of the line, an +\texttt{ID} is expected, so it will return \texttt{"shell"} as a token +of type \texttt{ID}. Later, it will return \texttt{"progman.exe"} as +a token of type \texttt{VAL}. 
+ +Context sensitivity decreases the separation between scanner and +parser, but it is useful in parsers like \texttt{IniFile}, where the +tokens themselves are not unambiguous, but \emph{are} unambiguous +given a particular stage in the parsing process. + +Unfortunately, context sensitivity can make it more difficult to +detect errors in the input. For example, in parsing a Pascal-like +language with ``begin'' and ``end'' as keywords, a context sensitive +scanner would only match ``end'' as the END token if the parser is in +a place that will accept the END token. If not, then the scanner +would match ``end'' as an identifier. To disable the context +sensitive scanner in Yapps, add the +\texttt{context-insensitive-scanner} option to the grammar: + +\begin{verbatim} +Parser X: + option: "context-insensitive-scanner" +\end{verbatim} + +Context-insensitive scanning makes the parser look cleaner as well. + +\mysubsection{Internal Variables} + +There are two internal fields that may be of use. The parser object +has two fields, \texttt{\_pos}, which is the index of the current +token being matched, and \texttt{\_scanner}, which is the scanner +object. The token itself can be retrieved by accessing the scanner +object and calling the \texttt{token} method with the token index. However, if you call \texttt{token} before the token has been requested by the parser, it may mess up a context-sensitive scanner.\footnote{When using a context-sensitive scanner, the parser tells the scanner what the valid token types are at each point. If you call \texttt{token} before the parser can tell the scanner the valid token types, the scanner will attempt to match without considering the context.} A +potentially useful combination of these fields is to extract the +portion of the input matched by the current rule. 
To do this, just save the scanner state (\texttt{\_scanner.pos}) before the text is matched and then again after the text is matched:
+
+\begin{verbatim}
+ rule R:
+ {{ start = self._scanner.pos }}
+ a b c
+ {{ end = self._scanner.pos }}
+ {{ print 'Text is', self._scanner.input[start:end] }}
+\end{verbatim}
+
+\mysubsection{Pre- and Post-Parser Code}
+
+Sometimes the parser code needs to rely on helper variables,
+functions, and classes. A Yapps grammar can optionally be surrounded
+by double percent signs, to separate the grammar from Python code.
+
+\begin{verbatim}
+... Python code ...
+%%
+... Yapps grammar ...
+%%
+... Python code ...
+\end{verbatim}
+
+The second \verb|%%| can be omitted if there is no Python code at the
+end, and the first \verb|%%| can be omitted if there is no extra
+Python code at all. (To have code only at the end, both separators
+are required.)
+
+If the second \verb|%%| is omitted, Yapps will insert testing code
+that allows you to use the generated parser to parse a file.
+
+The extended calculator example in the Yapps examples subdirectory
+includes both pre-parser and post-parser code.
+
+\mysubsection{Representation of Grammars}
+
+For each kind of pattern there is a class derived from Pattern. Yapps
+has classes for Terminal, NonTerminal, Sequence, Choice, Option, Plus,
+Star, and Eval. Each of these classes has the following interface:
+
+\begin{itemize}
+ \item[setup(\emph{gen})] Set accepts-$\epsilon$, and call
+ \emph{gen.changed()} if it changed. This function can change the
+ flag from false to true but \emph{not} from true to false.
+ \item[update(\emph{gen})] Set \first and \follow, and call
+ \emph{gen.changed()} if either changed. This function can add to
+ the sets but \emph{not} remove from them.
+ \item[output(\emph{gen}, \emph{indent})] Generate code for matching
+ this rule, using \emph{indent} as the current indentation level.
+ Writes are performed using \emph{gen.write}.
+ \item[used(\emph{vars})] Given a list of variables \emph{vars}, + return two lists: one containing the variables that are used, and + one containing the variables that are assigned. This function is + used for optimizing the resulting code. +\end{itemize} + +Both \emph{setup} and \emph{update} monotonically increase the +variables they modify. Since the variables can only increase a finite +number of times, we can repeatedly call the function until the +variable stabilized. The \emph{used} function is not currently +implemented. + +With each pattern in the grammar Yapps associates three pieces of +information: the \first set, the \follow set, and the +accepts-$\epsilon$ flag. + +The \first set contains the tokens that can appear as we start +matching the pattern. The \follow set contains the tokens that can +appear immediately after we match the pattern. The accepts-$\epsilon$ +flag is true if the pattern can match no tokens. In this case, \first +will contain all the elements in \follow. The \follow set is not +needed when accepts-$\epsilon$ is false, and may not be accurate in +those cases. + +Yapps does not compute these sets precisely. Its approximation can +miss certain cases, such as this one: + +\begin{verbatim} + rule C: ( A* | B ) + rule B: C [A] +\end{verbatim} + +Yapps will calculate {\tt C}'s \follow set to include {\tt A}. +However, {\tt C} will always match all the {\tt A}'s, so {\tt A} will +never follow it. Yapps 2.0 does not properly handle this construct, +but if it seems important, I may add support for it in a future +version. + +Yapps also cannot handle constructs that depend on the calling +sequence. For example: + +\begin{verbatim} + rule R: U | 'b' + rule S: | 'c' + rule T: S 'b' + rule U: S 'a' +\end{verbatim} + +The \follow set for {\tt S} includes {\tt a} and {\tt b}. Since {\tt + S} can be empty, the \first set for {\tt S} should include {\tt a}, +{\tt b}, and {\tt c}. 
However, when parsing {\tt R}, if the lookahead
+is {\tt b} we should \emph{not} parse {\tt U}. That's because in {\tt
+ U}, {\tt S} is followed by {\tt a} and not {\tt b}. Therefore in
+{\tt R}, we should choose rule {\tt U} only if there is an {\tt a} or
+{\tt c}, but not if there is a {\tt b}. Yapps and many other LL(1)
+systems do not distinguish {\tt S b} and {\tt S a}, making {\tt
+ S}'s \follow set {\tt a, b}, and making {\tt R} always try to match
+{\tt U}. In this case we can solve the problem by changing {\tt R} to
+\verb:'b' | U: but it may not always be possible to solve all such
+problems in this way.
+
+\appendix
+
+\mysection{Grammar for Parsers}
+
+This is the grammar for parsers, without any Python code mixed in.
+The complete grammar can be found in \texttt{parsedesc.g} in the Yapps
+distribution.
+
+\begin{verbatim}
+parser ParserDescription:
+ ignore: "\\s+"
+ ignore: "#.*?\r?\n"
+ token END: "$" # $ means end of string
+ token ATTR: "<<.+?>>"
+ token STMT: "{{.+?}}"
+ token ID: '[a-zA-Z_][a-zA-Z_0-9]*'
+ token STR: '[rR]?\'([^\\n\'\\\\]|\\\\.)*\'|[rR]?"([^\\n"\\\\]|\\\\.)*"'
+
+ rule Parser: "parser" ID ":"
+ Options
+ Tokens
+ Rules
+ END
+
+ rule Options: ( "option" ":" STR )*
+ rule Tokens: ( "token" ID ":" STR | "ignore" ":" STR )*
+ rule Rules: ( "rule" ID OptParam ":" ClauseA )*
+
+ rule ClauseA: ClauseB ( '[|]' ClauseB )*
+ rule ClauseB: ClauseC*
+ rule ClauseC: ClauseD [ '[+]' | '[*]' ]
+ rule ClauseD: STR | ID [ATTR] | STMT
+ | '\\(' ClauseA '\\)' | '\\[' ClauseA '\\]'
+\end{verbatim}
+
+\mysection{Upgrading}
+
+Yapps 2.0 is not backwards compatible with Yapps 1.0. In this section
+are some tips for upgrading:
+
+\begin{enumerate}
+ \item Yapps 1.0 was distributed as a single file. Yapps 2.0 is
+ instead distributed as two Python files: a \emph{parser generator}
+ (26k) and a \emph{parser runtime} (5k). You need both files to
+ create parsers, but you need only the runtime (\texttt{yappsrt.py})
+ to use the parsers.
+ + \item Yapps 1.0 supported Python 1.4 regular expressions from the + \texttt{regex} module. Yapps 2.0 uses Python 1.5 regular + expressions from the \texttt{re} module. \emph{The new syntax for + regular expressions is not compatible with the old syntax.} + Andrew Kuchling has a \htmladdnormallink{guide to converting + regular + expressions}{http://www.python.org/doc/howto/regex-to-re/} on his + web page. + + \item Yapps 1.0 wants a pattern and then a return value in \verb|->| + \verb|<<...>>|. Yapps 2.0 allows patterns and Python statements to + be mixed. To convert a rule like this: + +\begin{verbatim} +rule R: A B C -> << E1 >> + | X Y Z -> << E2 >> +\end{verbatim} + + to Yapps 2.0 form, replace the return value specifiers with return + statements: + +\begin{verbatim} +rule R: A B C {{ return E1 }} + | X Y Z {{ return E2 }} +\end{verbatim} + + \item Yapps 2.0 does not perform tail recursion elimination. This + means any recursive rules you write will be turned into recursive + methods in the parser. The parser will work, but may be slower. + It can be made faster by rewriting recursive rules, using instead + the looping operators \verb|*| and \verb|+| provided in Yapps 2.0. + +\end{enumerate} + +\mysection{Troubleshooting} + +\begin{itemize} + \item A common error is to write a grammar that doesn't have an END + token. End tokens are needed when it is not clear when to stop + parsing. For example, when parsing the expression {\tt 3+5}, it is + not clear after reading {\tt 3} whether to treat it as a complete + expression or whether the parser should continue reading. + Therefore the grammar for numeric expressions should include an end + token. Another example is the grammar for Lisp expressions. In + Lisp, it is always clear when you should stop parsing, so you do + \emph{not} need an end token. In fact, it may be more useful not + to have an end token, so that you can read in several Lisp expressions. 
+ \item If there is a chance of ambiguity, make sure to put the choices + in the order you want them checked. Usually the most specific + choice should be first. Empty sequences should usually be last. + \item The context sensitive scanner is not appropriate for all + grammars. You might try using the insensitive scanner with the + {\tt context-insensitive-scanner} option in the grammar. + \item If performance turns out to be a problem, try writing a custom + scanner. The Yapps scanner is rather slow (but flexible and easy + to understand). +\end{itemize} + +\mysection{History} + +Yapps 1 had several limitations that bothered me while writing +parsers: + +\begin{enumerate} + \item It was not possible to insert statements into the generated + parser. A common workaround was to write an auxilliary function + that executed those statements, and to call that function as part + of the return value calculation. For example, several of my + parsers had an ``append(x,y)'' function that existed solely to call + ``x.append(y)''. + \item The way in which grammars were specified was rather + restrictive: a rule was a choice of clauses. Each clause was a + sequence of tokens and rule names, followed by a return value. + \item Optional matching had to be put into a separate rule because + choices were only made at the beginning of a rule. + \item Repetition had to be specified in terms of recursion. Not only + was this awkward (sometimes requiring additional rules), I had to + add a tail recursion optimization to Yapps to transform the + recursion back into a loop. +\end{enumerate} + +Yapps 2 addresses each of these limitations. + +\begin{enumerate} + \item Statements can occur anywhere within a rule. (However, only + one-line statements are allowed; multiline blocks marked by + indentation are not.) + \item Grammars can be specified using any mix of sequences, choices, + tokens, and rule names. To allow for complex structures, + parentheses can be used for grouping. 
+ \item Given choices and parenthesization, optional matching can be + expressed as a choice between some pattern and nothing. In + addition, Yapps 2 has the convenience syntax \verb|[A B ...]| for + matching \verb|A B ...| optionally. + \item Repetition operators \verb|*| for zero or more and \verb|+| for + one or more make it easy to specify repeating patterns. +\end{enumerate} + +It is my hope that Yapps 2 will be flexible enough to meet my needs +for another year, yet simple enough that I do not hesitate to use it. + +\mysection{Debian Extensions} +\label{sec:debian} + +The Debian version adds the following enhancements to the original +Yapps code. They were written by Matthias Urlichs. + +\begin{enumerate} + \item Yapps can stack input sources ("include files"). A usage example + is supplied with the calc.g sample program. + \item Yapps now understands augmented ignore-able patterns. + This means that Yapps can parse multi-line C comments; this wasn't + possible before. + \item Better error reporting. + \item Yapps now reads its input incrementally. +\end{enumerate} + +The generated parser has been renamed to \texttt{yapps/runtime.py}. +In Debian, this file is provided by the \texttt{yapps2-runtime} package. +You need to depend on it if you Debianize Python programs which use +yapps. + +\mysection{Future Extensions} +\label{sec:future} + +I am still investigating the possibility of LL(2) and higher +lookahead. However, it looks like the resulting parsers will be +somewhat ugly. + +It would be nice to control choices with user-defined predicates. + +The most likely future extension is backtracking. A grammar pattern +like \verb|(VAR ':=' expr)? {{ return Assign(VAR,expr) }} : expr {{ return expr }}| +would turn into code that attempted to match \verb|VAR ':=' expr|. If +it succeeded, it would run \verb|{{ return ... }}|. If it failed, it +would match \verb|expr {{ return expr }}|. Backtracking may make it +less necessary to write LL(2) grammars. 
+
+\mysection{References}
+
+\begin{enumerate}
+ \item The \htmladdnormallink{Python-Parser
+ SIG}{http://www.python.org/sigs/parser-sig/} is the first place
+ to look for a list of parser systems for Python.
+
+ \item ANTLR/PCCTS, by Terence Parr, is available at
+ \htmladdnormallink{The ANTLR Home Page}{http://www.antlr.org/}.
+
+ \item PyLR, by Scott Cotton, is at \htmladdnormallink{his Starship
+ page}{http://starship.skyport.net/crew/scott/PyLR.html}.
+
+ \item John Aycock's \htmladdnormallink{Compiling Little Languages
+ Framework}{http://www.foretec.com/python/workshops/1998-11/proceedings/papers/aycock-little/aycock-little.html}.
+
+ \item PyBison, by Scott Hassan, can be found at
+ \htmladdnormallink{his Python Projects
+ page}{http://coho.stanford.edu/\~{}hassan/Python/}.
+
+ \item mcf.pars, by Mike C. Fletcher, is available at
+ \htmladdnormallink{his web
+ page}{http://members.rogers.com/mcfletch/programming/simpleparse/simpleparse.html}.
+
+ \item kwParsing, by Aaron Watters, is available at
+ \htmladdnormallink{his Starship
+ page}{http://starship.skyport.net/crew/aaron_watters/kwParsing/}.
+\end{enumerate} + +\end{document} diff --git a/examples/calc.g b/examples/calc.g new file mode 100644 index 0000000..5432855 --- /dev/null +++ b/examples/calc.g @@ -0,0 +1,64 @@ +globalvars = {} # We will store the calculator's variables here + +def lookup(map, name): + for x,v in map: + if x == name: return v + if not globalvars.has_key(name): print 'Undefined (defaulting to 0):', name + return globalvars.get(name, 0) + +def stack_input(scanner,ign): + """Grab more input""" + scanner.stack_input(raw_input(">?> ")) + +%% +parser Calculator: + ignore: "[ \r\t\n]+" + ignore: "[?]" {{ stack_input }} + + token END: "$" + token NUM: "[0-9]+" + token VAR: "[a-zA-Z_]+" + + # Each line can either be an expression or an assignment statement + rule goal: expr<<[]>> END {{ print '=', expr }} + {{ return expr }} + | "set" VAR expr<<[]>> END {{ globalvars[VAR] = expr }} + {{ print VAR, '=', expr }} + {{ return expr }} + + # An expression is the sum and difference of factors + rule expr<>: factor<> {{ n = factor }} + ( "[+]" factor<> {{ n = n+factor }} + | "-" factor<> {{ n = n-factor }} + )* {{ return n }} + + # A factor is the product and division of terms + rule factor<>: term<> {{ v = term }} + ( "[*]" term<> {{ v = v*term }} + | "/" term<> {{ v = v/term }} + )* {{ return v }} + + # A term is a number, variable, or an expression surrounded by parentheses + rule term<>: + NUM {{ return int(NUM) }} + | VAR {{ return lookup(V, VAR) }} + | "\\(" expr "\\)" {{ return expr }} + | "let" VAR "=" expr<> {{ V = [(VAR, expr)] + V }} + "in" expr<> {{ return expr }} +%% +if __name__=='__main__': + print 'Welcome to the calculator sample for Yapps 2.' + print ' Enter either "" or "set ",' + print ' or just press return to exit. 
An expression can have' + print ' local variables: let x = expr in expr' + # We could have put this loop into the parser, by making the + # `goal' rule use (expr | set var expr)*, but by putting the + # loop into Python code, we can make it interactive (i.e., enter + # one expression, get the result, enter another expression, etc.) + while 1: + try: s = raw_input('>>> ') + except EOFError: break + if not s.strip(): break + parse('goal', s) + print 'Bye.' + diff --git a/examples/expr.g b/examples/expr.g new file mode 100644 index 0000000..ae807b7 --- /dev/null +++ b/examples/expr.g @@ -0,0 +1,21 @@ +parser Calculator: + token END: "$" + token NUM: "[0-9]+" + + rule goal: expr END {{ return expr }} + + # An expression is the sum and difference of factors + rule expr: factor {{ v = factor }} + ( "[+]" factor {{ v = v+factor }} + | "-" factor {{ v = v-factor }} + )* {{ return v }} + + # A factor is the product and division of terms + rule factor: term {{ v = term }} + ( "[*]" term {{ v = v*term }} + | "/" term {{ v = v/term }} + )* {{ return v }} + + # A term is either a number or an expression surrounded by parentheses + rule term: NUM {{ return atoi(NUM) }} + | "\\(" expr "\\)" {{ return expr }} diff --git a/examples/lisp.g b/examples/lisp.g new file mode 100644 index 0000000..551e6f9 --- /dev/null +++ b/examples/lisp.g @@ -0,0 +1,13 @@ +parser Lisp: + ignore: r'\s+' + token NUM: r'[0-9]+' + token ID: r'[-+*/!@$%^&=.a-zA-Z0-9_]+' + token STR: r'"([^\\"]+|\\.)*"' + + rule expr: ID {{ return ('id',ID) }} + | STR {{ return ('str',eval(STR)) }} + | NUM {{ return ('num',int(NUM)) }} + | r"\(" + {{ e = [] }} # initialize the list + ( expr {{ e.append(expr) }} ) * # put each expr into the list + r"\)" {{ return e }} # return the list diff --git a/examples/notes b/examples/notes new file mode 100644 index 0000000..aa45bfb --- /dev/null +++ b/examples/notes @@ -0,0 +1,44 @@ +Hints +##### + +Some additional hints for your edification. 
+ +Author: Matthias Urlichs + +How to process C preprocessor codes: +==================================== + +Rudimentary include handling has been added to the parser by me. + +However, if you want to do anything fancy, like for instance whatever +the C preprocessor does, things get more complicated. Fortunately, +there's already a nice tool to handle C preprocessing -- CPP itself. + +If you want to report errors correctly in that situation, do this: + + def set_line(s,m): + """Fixup the scanner's idea of the current line""" + s.filename = m.group(2) + line = int(m.group(1)) + s.del_line = line - s.line + + %% + parser whatever: + ignore: '^#\s*(\d+)\s*"([^"\n]+)"\s*\n' {{ set_line }} + ignore: '^#.*\n' + + [...] + %% + if __name__=='__main__': + import sys,os + for a in sys.argv[1:]: + f=os.popen("cpp "+repr(a),"r") + + P = whatever(whateverScanner("", filename=a, file=f)) + try: P.goal() + except runtime.SyntaxError, e: + runtime.print_error(e, P._scanner) + sys.exit(1) + + f.close() + diff --git a/examples/xml.g b/examples/xml.g new file mode 100644 index 0000000..cec2709 --- /dev/null +++ b/examples/xml.g @@ -0,0 +1,66 @@ +#!/usr/bin/python2 + +# xml.g +# +# Amit J. Patel, August 2003 +# +# Simple (non-conforming, non-validating) parsing of XML documents, +# based on Robert D. Cameron's "REX" shallow parser. It doesn't +# handle CDATA and lots of other stuff; it's meant to demonstrate +# Yapps, not replace a proper XML parser. + +%% + +parser xml: + token nodetext: r'[^<>]+' + token attrtext_singlequote: "[^']*" + token attrtext_doublequote: '[^"]*' + token SP: r'\s' + token id: r'[a-zA-Z_:][a-zA-Z0-9_:.-]*' + + rule node: + r'' {{ return ['!--comment'] }} + | r'' {{ return ['![CDATA['] }} + | r']*>' {{ return ['!doctype'] }} + | '<' SP* id SP* attributes SP* {{ startid = id }} + ( '>' nodes '' {{ assert startid == id, 'Mismatched tags <%s> ... 
' % (startid, id) }} + {{ return [id, attributes] + nodes }} + | '/\s*>' {{ return [id, attributes] }} + ) + | nodetext {{ return nodetext }} + + rule nodes: {{ result = [] }} + ( node {{ result.append(node) }} + ) * {{ return result }} + + rule attribute: id SP* '=' SP* + ( '"' attrtext_doublequote '"' {{ return (id, attrtext_doublequote) }} + | "'" attrtext_singlequote "'" {{ return (id, attrtext_singlequote) }} + ) + + rule attributes: {{ result = {} }} + ( attribute SP* {{ result[attribute[0]] = attribute[1] }} + ) * {{ return result }} + +%% + +if __name__ == '__main__': + tests = ['', + 'some text', + '< bad xml', + '
', + '< spacey a = "foo" / >', + 'text ... ', + ' middle ', + ' foo bar ', + ] + print + print '____Running tests_______________________________________' + for test in tests: + print + try: + parser = xml(xmlScanner(test)) + output = '%s ==> %s' % (repr(test), repr(parser.node())) + except (yappsrt.SyntaxError, AssertionError), e: + output = '%s ==> FAILED ==> %s' % (repr(test), e) + print output diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..0d7c98d --- /dev/null +++ b/setup.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python + +"""Setup script for 'yapps'""" + +from distutils.core import setup + +description = "Yet Another Python Parser System" +long_description = \ +""" +YAPPS is an easy to use parser generator that is written in Python and +generates Python code. There are several parser generator systems +already available for Python, but this parser has different goals: +Yapps is simple, very easy to use, and produces human-readable parsers. + +It is not the fastest or most powerful parser. Yapps is designed to be +used when regular expressions are not enough and other parser systems +are too much: situations where you might otherwise write your own +recursive descent parser. + +This package contains several upward-compatible enhancements to the +original YAPPS source: +- Handle stacked input ("include files") +- augmented ignore-able patterns (can parse multi-line C comments correctly) +- better error reporting +- read input incrementally +""" + +setup (name = "python-yapps", + version = "2.1.1", + description = description, + long_description = long_description, + author = "Amit J. 
Patel", + author_email = "amitp@cs.stanford.edu", + maintainer = "Matthias Urlichs", + maintainer_email = "smurf@debian.org", + url = "http://theory.stanford.edu/~amitp/yapps/", + license = 'MIT', + platforms = ['POSIX'], + keywords = ['parsing'], + packages = ['yapps'], + #cmdclass = {'bdist_rpm': MyBDist_RPM}, + ) diff --git a/test/empty_clauses.g b/test/empty_clauses.g new file mode 100644 index 0000000..3b435f2 --- /dev/null +++ b/test/empty_clauses.g @@ -0,0 +1,10 @@ +# This parser tests the use of OR clauses with one of them being empty +# +# The output of --dump should indicate the FOLLOW set for (A | ) is 'c'. + +parser Test: + rule TestPlus: ( A | ) 'c' + rule A: 'a'+ + + rule TestStar: ( B | ) 'c' + rule B: 'b'* \ No newline at end of file diff --git a/test/line_numbers.g b/test/line_numbers.g new file mode 100644 index 0000000..19eee9a --- /dev/null +++ b/test/line_numbers.g @@ -0,0 +1,10 @@ +# +# The error messages produced by Yapps have a line number. +# The line number should take the Python code section into account. + +# The line number should be 10. + +%% + +parser error_1: + this_is_an_error; diff --git a/test/option.g b/test/option.g new file mode 100644 index 0000000..5980362 --- /dev/null +++ b/test/option.g @@ -0,0 +1,17 @@ + +%% + +parser test_option: + ignore: r'\s+' + token a: 'a' + token b: 'b' + token EOF: r'$' + + rule test_brackets: a [b] EOF + + rule test_question_mark: a b? EOF + +%% + +# The generated code for test_brackets and test_question_mark should +# be the same. diff --git a/yapps/__init__.py b/yapps/__init__.py new file mode 100644 index 0000000..1bb8bf6 --- /dev/null +++ b/yapps/__init__.py @@ -0,0 +1 @@ +# empty diff --git a/yapps/grammar.py b/yapps/grammar.py new file mode 100644 index 0000000..1714344 --- /dev/null +++ b/yapps/grammar.py @@ -0,0 +1,211 @@ +# grammar.py, part of Yapps 2 - yet another python parser system +# Copyright 1999-2003 by Amit J. 
Patel +# +# This version of the Yapps 2 grammar can be distributed under the +# terms of the MIT open source license, either found in the LICENSE +# file included with the Yapps distribution +# or at +# +# + +"""Parser for Yapps grammars. + +This file defines the grammar of Yapps grammars. Naturally, it is +implemented in Yapps. The grammar.py module needed by Yapps is built +by running Yapps on yapps_grammar.g. (Holy circularity, Batman!) + +""" + +import sys, re +from yapps import parsetree + +###################################################################### +def cleanup_choice(rule, lst): + if len(lst) == 0: return Sequence(rule, []) + if len(lst) == 1: return lst[0] + return parsetree.Choice(rule, *tuple(lst)) + +def cleanup_sequence(rule, lst): + if len(lst) == 1: return lst[0] + return parsetree.Sequence(rule, *tuple(lst)) + +def resolve_name(rule, tokens, id, args): + if id in [x[0] for x in tokens]: + # It's a token + if args: + print 'Warning: ignoring parameters on TOKEN %s<<%s>>' % (id, args) + return parsetree.Terminal(rule, id) + else: + # It's a name, so assume it's a nonterminal + return parsetree.NonTerminal(rule, id, args) + + +# Begin -- grammar generated by Yapps +import sys, re +from yapps import runtime + +class ParserDescriptionScanner(runtime.Scanner): + patterns = [ + ('"rule"', re.compile('rule')), + ('"ignore"', re.compile('ignore')), + ('"token"', re.compile('token')), + ('"option"', re.compile('option')), + ('":"', re.compile(':')), + ('"parser"', re.compile('parser')), + ('[ \t\r\n]+', re.compile('[ \t\r\n]+')), + ('#.*?\r?\n', re.compile('#.*?\r?\n')), + ('EOF', re.compile('$')), + ('ATTR', re.compile('<<.+?>>')), + ('STMT', re.compile('{{.+?}}')), + ('ID', re.compile('[a-zA-Z_][a-zA-Z_0-9]*')), + ('STR', re.compile('[rR]?\'([^\\n\'\\\\]|\\\\.)*\'|[rR]?"([^\\n"\\\\]|\\\\.)*"')), + ('LP', re.compile('\\(')), + ('RP', re.compile('\\)')), + ('LB', re.compile('\\[')), + ('RB', re.compile('\\]')), + ('OR', re.compile('[|]')), + 
('STAR', re.compile('[*]')), + ('PLUS', re.compile('[+]')), + ('QUEST', re.compile('[?]')), + ('COLON', re.compile(':')), + ] + def __init__(self, str,*args,**kw): + runtime.Scanner.__init__(self,None,{'[ \t\r\n]+':None,'#.*?\r?\n':None,},str,*args,**kw) + +class ParserDescription(runtime.Parser): + Context = runtime.Context + def Parser(self, _parent=None): + _context = self.Context(_parent, self._scanner, 'Parser', []) + self._scan('"parser"', context=_context) + ID = self._scan('ID', context=_context) + self._scan('":"', context=_context) + Options = self.Options(_context) + Tokens = self.Tokens(_context) + Rules = self.Rules(Tokens, _context) + EOF = self._scan('EOF', context=_context) + return parsetree.Generator(ID,Options,Tokens,Rules) + + def Options(self, _parent=None): + _context = self.Context(_parent, self._scanner, 'Options', []) + opt = {} + while self._peek('"option"', '"token"', '"ignore"', 'EOF', '"rule"', context=_context) == '"option"': + self._scan('"option"', context=_context) + self._scan('":"', context=_context) + Str = self.Str(_context) + opt[Str] = 1 + return opt + + def Tokens(self, _parent=None): + _context = self.Context(_parent, self._scanner, 'Tokens', []) + tok = [] + while self._peek('"token"', '"ignore"', 'EOF', '"rule"', context=_context) in ['"token"', '"ignore"']: + _token = self._peek('"token"', '"ignore"', context=_context) + if _token == '"token"': + self._scan('"token"', context=_context) + ID = self._scan('ID', context=_context) + self._scan('":"', context=_context) + Str = self.Str(_context) + tok.append( (ID,Str) ) + else: # == '"ignore"' + self._scan('"ignore"', context=_context) + self._scan('":"', context=_context) + Str = self.Str(_context) + ign = ('#ignore',Str) + if self._peek('STMT', '"token"', '"ignore"', 'EOF', '"rule"', context=_context) == 'STMT': + STMT = self._scan('STMT', context=_context) + ign = ign + (STMT[2:-2],) + tok.append( ign ) + return tok + + def Rules(self, tokens, _parent=None): + _context = 
self.Context(_parent, self._scanner, 'Rules', [tokens]) + rul = [] + while self._peek('"rule"', 'EOF', context=_context) == '"rule"': + self._scan('"rule"', context=_context) + ID = self._scan('ID', context=_context) + OptParam = self.OptParam(_context) + self._scan('":"', context=_context) + ClauseA = self.ClauseA(ID, tokens, _context) + rul.append( (ID, OptParam, ClauseA) ) + return rul + + def ClauseA(self, rule, tokens, _parent=None): + _context = self.Context(_parent, self._scanner, 'ClauseA', [rule, tokens]) + ClauseB = self.ClauseB(rule,tokens, _context) + v = [ClauseB] + while self._peek('OR', 'RP', 'RB', '"rule"', 'EOF', context=_context) == 'OR': + OR = self._scan('OR', context=_context) + ClauseB = self.ClauseB(rule,tokens, _context) + v.append(ClauseB) + return cleanup_choice(rule,v) + + def ClauseB(self, rule,tokens, _parent=None): + _context = self.Context(_parent, self._scanner, 'ClauseB', [rule,tokens]) + v = [] + while self._peek('STR', 'ID', 'LP', 'LB', 'STMT', 'OR', 'RP', 'RB', '"rule"', 'EOF', context=_context) in ['STR', 'ID', 'LP', 'LB', 'STMT']: + ClauseC = self.ClauseC(rule,tokens, _context) + v.append(ClauseC) + return cleanup_sequence(rule, v) + + def ClauseC(self, rule,tokens, _parent=None): + _context = self.Context(_parent, self._scanner, 'ClauseC', [rule,tokens]) + ClauseD = self.ClauseD(rule,tokens, _context) + _token = self._peek('PLUS', 'STAR', 'QUEST', 'STR', 'ID', 'LP', 'LB', 'STMT', 'OR', 'RP', 'RB', '"rule"', 'EOF', context=_context) + if _token == 'PLUS': + PLUS = self._scan('PLUS', context=_context) + return parsetree.Plus(rule, ClauseD) + elif _token == 'STAR': + STAR = self._scan('STAR', context=_context) + return parsetree.Star(rule, ClauseD) + elif _token == 'QUEST': + QUEST = self._scan('QUEST', context=_context) + return parsetree.Option(rule, ClauseD) + else: + return ClauseD + + def ClauseD(self, rule,tokens, _parent=None): + _context = self.Context(_parent, self._scanner, 'ClauseD', [rule,tokens]) + _token = 
self._peek('STR', 'ID', 'LP', 'LB', 'STMT', context=_context) + if _token == 'STR': + STR = self._scan('STR', context=_context) + t = (STR, eval(STR,{},{})) + if t not in tokens: tokens.insert( 0, t ) + return parsetree.Terminal(rule, STR) + elif _token == 'ID': + ID = self._scan('ID', context=_context) + OptParam = self.OptParam(_context) + return resolve_name(rule,tokens, ID, OptParam) + elif _token == 'LP': + LP = self._scan('LP', context=_context) + ClauseA = self.ClauseA(rule,tokens, _context) + RP = self._scan('RP', context=_context) + return ClauseA + elif _token == 'LB': + LB = self._scan('LB', context=_context) + ClauseA = self.ClauseA(rule,tokens, _context) + RB = self._scan('RB', context=_context) + return parsetree.Option(rule, ClauseA) + else: # == 'STMT' + STMT = self._scan('STMT', context=_context) + return parsetree.Eval(rule, STMT[2:-2]) + + def OptParam(self, _parent=None): + _context = self.Context(_parent, self._scanner, 'OptParam', []) + if self._peek('ATTR', '":"', 'PLUS', 'STAR', 'QUEST', 'STR', 'ID', 'LP', 'LB', 'STMT', 'OR', 'RP', 'RB', '"rule"', 'EOF', context=_context) == 'ATTR': + ATTR = self._scan('ATTR', context=_context) + return ATTR[2:-2] + return '' + + def Str(self, _parent=None): + _context = self.Context(_parent, self._scanner, 'Str', []) + STR = self._scan('STR', context=_context) + return eval(STR,{},{}) + + +def parse(rule, text): + P = ParserDescription(ParserDescriptionScanner(text)) + return runtime.wrap_error_reporter(P, rule) + +# End -- grammar generated by Yapps + + diff --git a/yapps/parsetree.py b/yapps/parsetree.py new file mode 100644 index 0000000..e5e0ae0 --- /dev/null +++ b/yapps/parsetree.py @@ -0,0 +1,673 @@ +# parsetree.py, part of Yapps 2 - yet another python parser system +# Copyright 1999-2003 by Amit J. 
Patel +# +# This version of the Yapps 2 Runtime can be distributed under the +# terms of the MIT open source license, either found in the LICENSE file +# included with the Yapps distribution +# or at +# +# + +"""Classes used to represent parse trees and generate output. + +This module defines the Generator class, which drives the generation +of Python output from a grammar parse tree. It also defines nodes +used to represent the parse tree; they are derived from class Node. + +The main logic of Yapps is in this module. +""" + +import sys, re + +###################################################################### +INDENT = ' '*4 +class Generator: + + # TODO: many of the methods here should be class methods, not instance methods + + def __init__(self, name, options, tokens, rules): + self.change_count = 0 + self.name = name + self.options = options + self.preparser = '' + self.postparser = None + + self.tokens = {} # Map from tokens to regexps + self.ignore = {} # List of token names to ignore in parsing, map to statements + self.terminals = [] # List of token names (to maintain ordering) + for t in tokens: + if len(t) == 3: + n,t,s = t + else: + n,t = t + s = None + + if n == '#ignore': + n = t + self.ignore[n] = s + if n in self.tokens.keys() and self.tokens[n] != t: + print >>sys.stderr, 'Warning: token %s defined more than once.' % n + self.tokens[n] = t + self.terminals.append(n) + + self.rules = {} # Map from rule names to parser nodes + self.params = {} # Map from rule names to parameters + self.goals = [] # List of rule names (to maintain ordering) + for n,p,r in rules: + self.params[n] = p + self.rules[n] = r + self.goals.append(n) + + self.output = sys.stdout + + def has_option(self, name): + return self.options.get(name, 0) + + def non_ignored_tokens(self): + return [x for x in self.terminals if x not in self.ignore] + + def changed(self): + """Increments the change count. 
+ + >>> t = Generator('', [], [], []) + >>> old_count = t.change_count + >>> t.changed() + >>> assert t.change_count == old_count + 1 + """ + self.change_count = 1+self.change_count + + def set_subtract(self, a, b): + """Returns the elements of a that are not in b. + + >>> t = Generator('', [], [], []) + >>> t.set_subtract([], []) + [] + >>> t.set_subtract([1, 2], [1, 2]) + [] + >>> t.set_subtract([1, 2, 3], [2]) + [1, 3] + >>> t.set_subtract([1], [2, 3, 4]) + [1] + """ + result = [] + for x in a: + if x not in b: + result.append(x) + return result + + def subset(self, a, b): + """True iff all elements of sequence a are inside sequence b + + >>> t = Generator('', [], [], []) + >>> t.subset([], [1, 2, 3]) + 1 + >>> t.subset([1, 2, 3], []) + 0 + >>> t.subset([1], [1, 2, 3]) + 1 + >>> t.subset([3, 2, 1], [1, 2, 3]) + 1 + >>> t.subset([1, 1, 1], [1, 2, 3]) + 1 + >>> t.subset([1, 2, 3], [1, 1, 1]) + 0 + """ + for x in a: + if x not in b: + return 0 + return 1 + + def equal_set(self, a, b): + """True iff subset(a, b) and subset(b, a) + + >>> t = Generator('', [], [], []) + >>> a_set = [1, 2, 3] + >>> t.equal_set(a_set, a_set) + 1 + >>> t.equal_set(a_set, a_set[:]) + 1 + >>> t.equal_set([], a_set) + 0 + >>> t.equal_set([1, 2, 3], [3, 2, 1]) + 1 + """ + if len(a) != len(b): return 0 + if a == b: return 1 + return self.subset(a, b) and self.subset(b, a) + + def add_to(self, parent, additions): + "Modify _parent_ to include all elements in _additions_" + for x in additions: + if x not in parent: + parent.append(x) + self.changed() + + def equate(self, a, b): + """Extend (a) and (b) so that they contain each others' elements. 
+ + >>> t = Generator('', [], [], []) + >>> a = [1, 2] + >>> b = [2, 3] + >>> t.equate(a, b) + >>> a + [1, 2, 3] + >>> b + [2, 3, 1] + """ + self.add_to(a, b) + self.add_to(b, a) + + def write(self, *args): + for a in args: + self.output.write(a) + + def in_test(self, expr, full, set): + """Generate a test of (expr) being in (set), where (set) is a subset of (full) + + expr is a string (Python expression) + set is a list of values (which will be converted with repr) + full is the list of all values expr could possibly evaluate to + + >>> t = Generator('', [], [], []) + >>> t.in_test('x', [1,2,3,4], []) + '0' + >>> t.in_test('x', [1,2,3,4], [1,2,3,4]) + '1' + >>> t.in_test('x', [1,2,3,4], [1]) + 'x == 1' + >>> t.in_test('a+b', [1,2,3,4], [1,2]) + 'a+b in [1, 2]' + >>> t.in_test('x', [1,2,3,4,5], [1,2,3]) + 'x not in [4, 5]' + >>> t.in_test('x', [1,2,3,4,5], [1,2,3,4]) + 'x != 5' + """ + + if not set: return '0' + if len(set) == 1: return '%s == %s' % (expr, repr(set[0])) + if full and len(set) > len(full)/2: + # Reverse the sense of the test. 
+ not_set = [x for x in full if x not in set] + return self.not_in_test(expr, full, not_set) + return '%s in %s' % (expr, repr(set)) + + def not_in_test(self, expr, full, set): + """Like in_test, but the reverse test.""" + if not set: return '1' + if len(set) == 1: return '%s != %s' % (expr, repr(set[0])) + return '%s not in %s' % (expr, repr(set)) + + def peek_call(self, a): + """Generate a call to scan for a token in the set 'a'""" + assert type(a) == type([]) + a_set = (repr(a)[1:-1]) + if self.equal_set(a, self.non_ignored_tokens()): a_set = '' + if self.has_option('context-insensitive-scanner'): a_set = '' + if a_set: a_set += "," + + return 'self._peek(%s context=_context)' % a_set + + def peek_test(self, a, b): + """Generate a call to test whether the next token (which could be any of + the elements in a) is in the set b.""" + if self.subset(a, b): return '1' + if self.has_option('context-insensitive-scanner'): a = self.non_ignored_tokens() + return self.in_test(self.peek_call(a), a, b) + + def not_peek_test(self, a, b): + """Like peek_test, but the opposite sense.""" + if self.subset(a, b): return '0' + return self.not_in_test(self.peek_call(a), a, b) + + def calculate(self): + """The main loop to compute the epsilon, first, follow sets. + The loop continues until the sets converge. 
This works because + each set can only get larger, so when they stop getting larger, + we're done.""" + # First we determine whether a rule accepts epsilon (the empty sequence) + while 1: + for r in self.goals: + self.rules[r].setup(self) + if self.change_count == 0: break + self.change_count = 0 + + # Now we compute the first/follow sets + while 1: + for r in self.goals: + self.rules[r].update(self) + if self.change_count == 0: break + self.change_count = 0 + + def dump_information(self): + """Display the grammar in somewhat human-readable form.""" + self.calculate() + for r in self.goals: + print ' _____' + '_'*len(r) + print ('___/Rule '+r+'\\' + '_'*80)[:79] + queue = [self.rules[r]] + while queue: + top = queue[0] + del queue[0] + + print 'Rule', repr(top), 'of class', top.__class__.__name__ + top.first.sort() + top.follow.sort() + eps = [] + if top.accepts_epsilon: eps = ['(null)'] + print ' FIRST:', ', '.join(top.first+eps) + print ' FOLLOW:', ', '.join(top.follow) + for x in top.get_children(): queue.append(x) + + def repr_ignore(self): + out="{" + for t,s in self.ignore.iteritems(): + if s is None: s=repr(s) + out += "%s:%s," % (repr(t),s) + out += "}" + return out + + def generate_output(self): + self.calculate() + self.write(self.preparser) + self.write("# Begin -- grammar generated by Yapps\n") + self.write("import sys, re\n") + self.write("from yapps import runtime\n") + self.write("\n") + self.write("class ", self.name, "Scanner(runtime.Scanner):\n") + self.write(" patterns = [\n") + for p in self.terminals: + self.write(" (%s, re.compile(%s)),\n" % ( + repr(p), repr(self.tokens[p]))) + self.write(" ]\n") + self.write(" def __init__(self, str,*args,**kw):\n") + self.write(" runtime.Scanner.__init__(self,None,%s,str,*args,**kw)\n" % + self.repr_ignore()) + self.write("\n") + + self.write("class ", self.name, "(runtime.Parser):\n") + self.write(INDENT, "Context = runtime.Context\n") + for r in self.goals: + self.write(INDENT, "def ", r, "(self") + if 
self.params[r]: self.write(", ", self.params[r]) + self.write(", _parent=None):\n") + self.write(INDENT+INDENT, "_context = self.Context(_parent, self._scanner, %s, [%s])\n" % + (repr(r), self.params.get(r, ''))) + self.rules[r].output(self, INDENT+INDENT) + self.write("\n") + + self.write("\n") + self.write("def parse(rule, text):\n") + self.write(" P = ", self.name, "(", self.name, "Scanner(text))\n") + self.write(" return runtime.wrap_error_reporter(P, rule)\n") + self.write("\n") + if self.postparser is not None: + self.write("# End -- grammar generated by Yapps\n") + self.write(self.postparser) + else: + self.write("if __name__ == '__main__':\n") + self.write(INDENT, "from sys import argv, stdin\n") + self.write(INDENT, "if len(argv) >= 2:\n") + self.write(INDENT*2, "if len(argv) >= 3:\n") + self.write(INDENT*3, "f = open(argv[2],'r')\n") + self.write(INDENT*2, "else:\n") + self.write(INDENT*3, "f = stdin\n") + self.write(INDENT*2, "print parse(argv[1], f.read())\n") + self.write(INDENT, "else: print >>sys.stderr, 'Args: []'\n") + self.write("# End -- grammar generated by Yapps\n") + +###################################################################### +class Node: + """This is the base class for all components of a grammar.""" + def __init__(self, rule): + self.rule = rule # name of the rule containing this node + self.first = [] + self.follow = [] + self.accepts_epsilon = 0 + + def setup(self, gen): + # Setup will change accepts_epsilon, + # sometimes from 0 to 1 but never 1 to 0. 
+ # It will take a finite number of steps to set things up + pass + + def used(self, vars): + "Return two lists: one of vars used, and the other of vars assigned" + return vars, [] + + def get_children(self): + "Return a list of sub-nodes" + return [] + + def __repr__(self): + return str(self) + + def update(self, gen): + if self.accepts_epsilon: + gen.add_to(self.first, self.follow) + + def output(self, gen, indent): + "Write out code to _gen_ with _indent_:string indentation" + gen.write(indent, "assert 0 # Invalid parser node\n") + +class Terminal(Node): + """This class stores terminal nodes, which are tokens.""" + def __init__(self, rule, token): + Node.__init__(self, rule) + self.token = token + self.accepts_epsilon = 0 + + def __str__(self): + return self.token + + def update(self, gen): + Node.update(self, gen) + if self.first != [self.token]: + self.first = [self.token] + gen.changed() + + def output(self, gen, indent): + gen.write(indent) + if re.match('[a-zA-Z_][a-zA-Z_0-9]*$', self.token): + gen.write(self.token, " = ") + gen.write("self._scan(%s, context=_context)\n" % repr(self.token)) + +class Eval(Node): + """This class stores evaluation nodes, from {{ ... 
}} clauses.""" + def __init__(self, rule, expr): + Node.__init__(self, rule) + self.expr = expr + + def setup(self, gen): + Node.setup(self, gen) + if not self.accepts_epsilon: + self.accepts_epsilon = 1 + gen.changed() + + def __str__(self): + return '{{ %s }}' % self.expr.strip() + + def output(self, gen, indent): + gen.write(indent, self.expr.strip(), '\n') + +class NonTerminal(Node): + """This class stores nonterminal nodes, which are rules with arguments.""" + def __init__(self, rule, name, args): + Node.__init__(self, rule) + self.name = name + self.args = args + + def setup(self, gen): + Node.setup(self, gen) + try: + self.target = gen.rules[self.name] + if self.accepts_epsilon != self.target.accepts_epsilon: + self.accepts_epsilon = self.target.accepts_epsilon + gen.changed() + except KeyError: # Oops, it's nonexistent + print >>sys.stderr, 'Error: no rule <%s>' % self.name + self.target = self + + def __str__(self): + return '%s' % self.name + + def update(self, gen): + Node.update(self, gen) + gen.equate(self.first, self.target.first) + gen.equate(self.follow, self.target.follow) + + def output(self, gen, indent): + gen.write(indent) + gen.write(self.name, " = ") + args = self.args + if args: args += ', ' + args += '_context' + gen.write("self.", self.name, "(", args, ")\n") + +class Sequence(Node): + """This class stores a sequence of nodes (A B C ...)""" + def __init__(self, rule, *children): + Node.__init__(self, rule) + self.children = children + + def setup(self, gen): + Node.setup(self, gen) + for c in self.children: c.setup(gen) + + if not self.accepts_epsilon: + # If it's not already accepting epsilon, it might now do so. 
+ for c in self.children: + # any non-epsilon means all is non-epsilon + if not c.accepts_epsilon: break + else: + self.accepts_epsilon = 1 + gen.changed() + + def get_children(self): + return self.children + + def __str__(self): + return '( %s )' % ' '.join(map(str, self.children)) + + def update(self, gen): + Node.update(self, gen) + for g in self.children: + g.update(gen) + + empty = 1 + for g_i in range(len(self.children)): + g = self.children[g_i] + + if empty: gen.add_to(self.first, g.first) + if not g.accepts_epsilon: empty = 0 + + if g_i == len(self.children)-1: + next = self.follow + else: + next = self.children[1+g_i].first + gen.add_to(g.follow, next) + + if self.children: + gen.add_to(self.follow, self.children[-1].follow) + + def output(self, gen, indent): + if self.children: + for c in self.children: + c.output(gen, indent) + else: + # Placeholder for empty sequences, just in case + gen.write(indent, 'pass\n') + +class Choice(Node): + """This class stores a choice between nodes (A | B | C | ...)""" + def __init__(self, rule, *children): + Node.__init__(self, rule) + self.children = children + + def setup(self, gen): + Node.setup(self, gen) + for c in self.children: c.setup(gen) + + if not self.accepts_epsilon: + for c in self.children: + if c.accepts_epsilon: + self.accepts_epsilon = 1 + gen.changed() + + def get_children(self): + return self.children + + def __str__(self): + return '( %s )' % ' | '.join(map(str, self.children)) + + def update(self, gen): + Node.update(self, gen) + for g in self.children: + g.update(gen) + + for g in self.children: + gen.add_to(self.first, g.first) + gen.add_to(self.follow, g.follow) + for g in self.children: + gen.add_to(g.follow, self.follow) + if self.accepts_epsilon: + gen.add_to(self.first, self.follow) + + def output(self, gen, indent): + test = "if" + gen.write(indent, "_token = ", gen.peek_call(self.first), "\n") + tokens_seen = [] + tokens_unseen = self.first[:] + if 
gen.has_option('context-insensitive-scanner'): + # Context insensitive scanners can return ANY token, + # not only the ones in first. + tokens_unseen = gen.non_ignored_tokens() + for c in self.children: + testset = c.first[:] + removed = [] + for x in testset: + if x in tokens_seen: + testset.remove(x) + removed.append(x) + if x in tokens_unseen: tokens_unseen.remove(x) + tokens_seen = tokens_seen + testset + if removed: + if not testset: + print >>sys.stderr, 'Error in rule', self.rule+':' + else: + print >>sys.stderr, 'Warning in rule', self.rule+':' + print >>sys.stderr, ' *', self + print >>sys.stderr, ' * These tokens could be matched by more than one clause:' + print >>sys.stderr, ' *', ' '.join(removed) + + if testset: + if not tokens_unseen: # context sensitive scanners only! + if test == 'if': + # if it's the first AND last test, then + # we can simply put the code without an if/else + c.output(gen, indent) + else: + gen.write(indent, "else:") + t = gen.in_test('', [], testset) + if len(t) < 70-len(indent): + gen.write(' #', t) + gen.write("\n") + c.output(gen, indent+INDENT) + else: + gen.write(indent, test, " ", + gen.in_test('_token', tokens_unseen, testset), + ":\n") + c.output(gen, indent+INDENT) + test = "elif" + + if tokens_unseen: + gen.write(indent, "else:\n") + gen.write(indent, INDENT, "raise runtime.SyntaxError(_token[0], ") + gen.write("'Could not match ", self.rule, "')\n") + +class Wrapper(Node): + """This is a base class for nodes that modify a single child.""" + def __init__(self, rule, child): + Node.__init__(self, rule) + self.child = child + + def setup(self, gen): + Node.setup(self, gen) + self.child.setup(gen) + + def get_children(self): + return [self.child] + + def update(self, gen): + Node.update(self, gen) + self.child.update(gen) + gen.add_to(self.first, self.child.first) + gen.equate(self.follow, self.child.follow) + +class Option(Wrapper): + """This class represents an optional clause of the form [A]""" + def setup(self, gen): 
+ Wrapper.setup(self, gen) + if not self.accepts_epsilon: + self.accepts_epsilon = 1 + gen.changed() + + def __str__(self): + return '[ %s ]' % str(self.child) + + def output(self, gen, indent): + if self.child.accepts_epsilon: + print >>sys.stderr, 'Warning in rule', self.rule+': contents may be empty.' + gen.write(indent, "if %s:\n" % + gen.peek_test(self.first, self.child.first)) + self.child.output(gen, indent+INDENT) + + if gen.has_option('context-insensitive-scanner'): + gen.write(indent, "if %s:\n" % + gen.not_peek_test(gen.non_ignored_tokens(), self.follow)) + gen.write(indent+INDENT, "raise runtime.SyntaxError(pos=self._scanner.get_pos(), context=_context, msg='Need one of ' + ', '.join(%s))\n" % + repr(self.first)) + + +class Plus(Wrapper): + """This class represents a 1-or-more repetition clause of the form A+""" + def setup(self, gen): + Wrapper.setup(self, gen) + if self.accepts_epsilon != self.child.accepts_epsilon: + self.accepts_epsilon = self.child.accepts_epsilon + gen.changed() + + def __str__(self): + return '%s+' % str(self.child) + + def update(self, gen): + Wrapper.update(self, gen) + gen.add_to(self.child.follow, self.child.first) + + def output(self, gen, indent): + if self.child.accepts_epsilon: + print >>sys.stderr, 'Warning in rule', self.rule+':' + print >>sys.stderr, ' * The repeated pattern could be empty. The resulting parser may not work properly.' 
+ gen.write(indent, "while 1:\n") + self.child.output(gen, indent+INDENT) + union = self.first[:] + gen.add_to(union, self.follow) + gen.write(indent+INDENT, "if %s: break\n" % + gen.not_peek_test(union, self.child.first)) + + if gen.has_option('context-insensitive-scanner'): + gen.write(indent, "if %s:\n" % + gen.not_peek_test(gen.non_ignored_tokens(), self.follow)) + gen.write(indent+INDENT, "raise runtime.SyntaxError(pos=self._scanner.get_pos(), context=_context, msg='Need one of ' + ', '.join(%s))\n" % + repr(self.first)) + + +class Star(Wrapper): + """This class represents a 0-or-more repetition clause of the form A*""" + def setup(self, gen): + Wrapper.setup(self, gen) + if not self.accepts_epsilon: + self.accepts_epsilon = 1 + gen.changed() + + def __str__(self): + return '%s*' % str(self.child) + + def update(self, gen): + Wrapper.update(self, gen) + gen.add_to(self.child.follow, self.child.first) + + def output(self, gen, indent): + if self.child.accepts_epsilon: + print >>sys.stderr, 'Warning in rule', self.rule+':' + print >>sys.stderr, ' * The repeated pattern could be empty. The resulting parser probably will not work properly.' + gen.write(indent, "while %s:\n" % + gen.peek_test(self.follow, self.child.first)) + self.child.output(gen, indent+INDENT) + + # TODO: need to generate tests like this in lots of rules + if gen.has_option('context-insensitive-scanner'): + gen.write(indent, "if %s:\n" % + gen.not_peek_test(gen.non_ignored_tokens(), self.follow)) + gen.write(indent+INDENT, "raise runtime.SyntaxError(pos=self._scanner.get_pos(), context=_context, msg='Need one of ' + ', '.join(%s))\n" % + repr(self.first)) + diff --git a/yapps/runtime.py b/yapps/runtime.py new file mode 100644 index 0000000..5d9d1d6 --- /dev/null +++ b/yapps/runtime.py @@ -0,0 +1,442 @@ +# Yapps 2 Runtime, part of Yapps 2 - yet another python parser system +# Copyright 1999-2003 by Amit J. 
Patel +# Enhancements copyright 2003-2004 by Matthias Urlichs +# +# This version of the Yapps 2 Runtime can be distributed under the +# terms of the MIT open source license, either found in the LICENSE file +# included with the Yapps distribution +# or at +# +# + +"""Run time libraries needed to run parsers generated by Yapps. + +This module defines parse-time exception classes, a scanner class, a +base class for parsers produced by Yapps, and a context class that +keeps track of the parse stack. + +""" + +import sys, re + +MIN_WINDOW=4096 +# File lookup window + +class SyntaxError(Exception): + """When we run into an unexpected token, this is the exception to use""" + def __init__(self, pos=None, msg="Bad Token", context=None): + Exception.__init__(self) + self.pos = pos + self.msg = msg + self.context = context + + def __str__(self): + if not self.pos: return 'SyntaxError' + else: return 'SyntaxError@%s(%s)' % (repr(self.pos), self.msg) + +class NoMoreTokens(Exception): + """Another exception object, for when we run out of tokens""" + pass + +class Token(object): + """Yapps token. + + This is a container for a scanned token. + """ + + def __init__(self, type,value, pos=None): + """Initialize a token.""" + self.type = type + self.value = value + self.pos = pos + + def __repr__(self): + output = '<%s: %s' % (self.type, repr(self.value)) + if self.pos: + output += " @ " + if self.pos[0]: + output += "%s:" % self.pos[0] + if self.pos[1]: + output += "%d" % self.pos[1] + if self.pos[2] is not None: + output += ".%d" % self.pos[2] + output += ">" + return output + +in_name=0 +class Scanner(object): + """Yapps scanner. + + The Yapps scanner can work in context sensitive or context + insensitive modes. The token(i) method is used to retrieve the + i-th token. It takes a restrict set that limits the set of tokens + it is allowed to return. In context sensitive mode, this restrict + set guides the scanner. 
In context insensitive mode, there is no + restriction (the set is always the full set of tokens). + + """ + + def __init__(self, patterns, ignore, input="", + file=None,filename=None,stacked=False): + """Initialize the scanner. + + Parameters: + patterns : [(terminal, uncompiled regex), ...] or None + ignore : {terminal:None, ...} + input : string + + If patterns is None, we assume that the subclass has + defined self.patterns : [(terminal, compiled regex), ...]. + Note that the patterns parameter expects uncompiled regexes, + whereas the self.patterns field expects compiled regexes. + + The 'ignore' value is either None or a callable, which is called + with the scanner and the to-be-ignored match object; this can + be used for include file or comment handling. + """ + + if not filename: + global in_name + filename="" % in_name + in_name += 1 + + self.input = input + self.ignore = ignore + self.file = file + self.filename = filename + self.pos = 0 + self.del_pos = 0 # skipped + self.line = 1 + self.del_line = 0 # skipped + self.col = 0 + self.tokens = [] + self.stack = None + self.stacked = stacked + + self.last_read_token = None + self.last_token = None + self.last_types = None + + if patterns is not None: + # Compile the regex strings into regex objects + self.patterns = [] + for terminal, regex in patterns: + self.patterns.append( (terminal, re.compile(regex)) ) + + def stack_input(self, input="", file=None, filename=None): + """Temporarily parse from a second file.""" + + # Already reading from somewhere else: Go on top of that, please. 
+ if self.stack: + # autogenerate a recursion-level-identifying filename + if not filename: + filename = 1 + else: + try: + filename += 1 + except TypeError: + pass + # now pass off to the include file + self.stack.stack_input(input,file,filename) + else: + + try: + filename += 0 + except TypeError: + pass + else: + filename = "" % filename + +# self.stack = object.__new__(self.__class__) +# Scanner.__init__(self.stack,self.patterns,self.ignore,input,file,filename, stacked=True) + + # Note that the pattern+ignore are added by the generated + # scanner code + self.stack = self.__class__(input,file,filename, stacked=True) + + def get_pos(self): + """Return a file/line/char tuple.""" + if self.stack: return self.stack.get_pos() + + return (self.filename, self.line+self.del_line, self.col) + +# def __repr__(self): +# """Print the last few tokens that have been scanned in""" +# output = '' +# for t in self.tokens: +# output += '%s\n' % (repr(t),) +# return output + + def print_line_with_pointer(self, pos, length=0, out=sys.stderr): + """Print the line of 'text' that includes position 'p', + along with a second line with a single caret (^) at position p""" + + file,line,p = pos + if file != self.filename: + if self.stack: return self.stack.print_line_with_pointer(pos,length=length,out=out) + print >>out, "(%s: not in input buffer)" % file + return + + text = self.input + p += length-1 # starts at pos 1 + + origline=line + line -= self.del_line + spos=0 + if line > 0: + while 1: + line = line - 1 + try: + cr = text.index("\n",spos) + except ValueError: + if line: + text = "" + break + if line == 0: + text = text[spos:cr] + break + spos = cr+1 + else: + print >>out, "(%s:%d not in input buffer)" % (file,origline) + return + + # Now try printing part of the line + text = text[max(p-80, 0):p+80] + p = p - max(p-80, 0) + + # Strip to the left + i = text[:p].rfind('\n') + j = text[:p].rfind('\r') + if i < 0 or (0 <= j < i): i = j + if 0 <= i < p: + p = p - i - 1 + text = 
text[i+1:] + + # Strip to the right + i = text.find('\n', p) + j = text.find('\r', p) + if i < 0 or (0 <= j < i): i = j + if i >= 0: + text = text[:i] + + # Now shorten the text + while len(text) > 70 and p > 60: + # Cut off 10 chars + text = "..." + text[10:] + p = p - 7 + + # Now print the string, along with an indicator + print >>out, '> ',text + print >>out, '> ',' '*p + '^' + + def grab_input(self): + """Get more input if possible.""" + if not self.file: return + if len(self.input) - self.pos >= MIN_WINDOW: return + + data = self.file.read(MIN_WINDOW) + if data is None or data == "": + self.file = None + + # Drop bytes from the start, if necessary. + if self.pos > 2*MIN_WINDOW: + self.del_pos += MIN_WINDOW + self.del_line += self.input[:MIN_WINDOW].count("\n") + self.pos -= MIN_WINDOW + self.input = self.input[MIN_WINDOW:] + data + else: + self.input = self.input + data + + def getchar(self): + """Return the next character.""" + self.grab_input() + + c = self.input[self.pos] + self.pos += 1 + return c + + def token(self, restrict, context=None): + """Scan for another token.""" + + while 1: + if self.stack: + try: + return self.stack.token(restrict, context) + except StopIteration: + self.stack = None + + # Keep looking for a token, ignoring any in self.ignore + self.grab_input() + + # special handling for end-of-file + if self.stacked and self.pos==len(self.input): + raise StopIteration + + # Search the patterns for the longest match, with earlier + # tokens in the list having preference + best_match = -1 + best_pat = '(error)' + best_m = None + for p, regexp in self.patterns: + # First check to see if we're ignoring this token + if restrict and p not in restrict and p not in self.ignore: + continue + m = regexp.match(self.input, self.pos) + if m and m.end()-m.start() > best_match: + # We got a match that's better than the previous one + best_pat = p + best_match = m.end()-m.start() + best_m = m + + # If we didn't find anything, raise an error + if best_pat == 
'(error)' and best_match < 0: + msg = 'Bad Token' + if restrict: + msg = 'Trying to find one of '+', '.join(restrict) + raise SyntaxError(self.get_pos(), msg, context=context) + + ignore = best_pat in self.ignore + value = self.input[self.pos:self.pos+best_match] + if not ignore: + tok=Token(type=best_pat, value=value, pos=self.get_pos()) + + self.pos += best_match + + npos = value.rfind("\n") + if npos > -1: + self.col = best_match-npos + self.line += value.count("\n") + else: + self.col += best_match + + # If we found something that isn't to be ignored, return it + if not ignore: + if len(self.tokens) >= 10: + del self.tokens[0] + self.tokens.append(tok) + self.last_read_token = tok + # print repr(tok) + return tok + else: + ignore = self.ignore[best_pat] + if ignore: + ignore(self, best_m) + + def peek(self, *types, **kw): + """Returns the token type for lookahead; if there are any args + then the list of args is the set of token types to allow""" + context = kw.get("context",None) + if self.last_token is None: + self.last_types = types + self.last_token = self.token(types,context) + elif self.last_types: + for t in types: + if t not in self.last_types: + raise NotImplementedError("Unimplemented: restriction set changed") + return self.last_token.type + + def scan(self, type, **kw): + """Returns the matched text, and moves to the next token""" + context = kw.get("context",None) + + if self.last_token is None: + tok = self.token([type],context) + else: + if self.last_types and type not in self.last_types: + raise NotImplementedError("Unimplemented: restriction set changed") + + tok = self.last_token + self.last_token = None + if tok.type != type: + if not self.last_types: self.last_types=[] + raise SyntaxError(tok.pos, 'Trying to find '+type+': '+ ', '.join(self.last_types)+", got "+tok.type, context=context) + return tok.value + +class Parser(object): + """Base class for Yapps-generated parsers. 
+ + """ + + def __init__(self, scanner): + self._scanner = scanner + + def _stack(self, input="",file=None,filename=None): + """Temporarily read from someplace else""" + self._scanner.stack_input(input,file,filename) + self._tok = None + + def _peek(self, *types, **kw): + """Returns the token type for lookahead; if there are any args + then the list of args is the set of token types to allow""" + return self._scanner.peek(*types, **kw) + + def _scan(self, type, **kw): + """Returns the matched text, and moves to the next token""" + return self._scanner.scan(type, **kw) + +class Context(object): + """Class to represent the parser's call stack. + + Every rule creates a Context that links to its parent rule. The + contexts can be used for debugging. + + """ + + def __init__(self, parent, scanner, rule, args=()): + """Create a new context. + + Args: + parent: Context object or None + scanner: Scanner object + rule: string (name of the rule) + args: tuple listing parameters to the rule + + """ + self.parent = parent + self.scanner = scanner + self.rule = rule + self.args = args + while scanner.stack: scanner = scanner.stack + self.token = scanner.last_read_token + + def __str__(self): + output = '' + if self.parent: output = str(self.parent) + ' > ' + output += self.rule + return output + +def print_error(err, scanner, max_ctx=None): + """Print error messages, the parser stack, and the input text -- for human-readable error messages.""" + # NOTE: this function assumes 80 columns :-( + # Figure out the line number + pos = err.pos + if not pos: + pos = scanner.get_pos() + + file_name, line_number, column_number = pos + print >>sys.stderr, '%s:%d:%d: %s' % (file_name, line_number, column_number, err.msg) + + scanner.print_line_with_pointer(pos) + + context = err.context + token = None + while context: + print >>sys.stderr, 'while parsing %s%s:' % (context.rule, tuple(context.args)) + if context.token: + token = context.token + if token: + 
scanner.print_line_with_pointer(token.pos, length=len(token.value)) + context = context.parent + if max_ctx: + max_ctx = max_ctx-1 + if not max_ctx: + break + +def wrap_error_reporter(parser, rule, *args,**kw): + try: + return getattr(parser, rule)(*args,**kw) + except SyntaxError, e: + print_error(e, parser._scanner) + except NoMoreTokens: + print >>sys.stderr, 'Could not complete parsing; stopped around here:' + print >>sys.stderr, parser._scanner diff --git a/yapps2.py b/yapps2.py new file mode 100755 index 0000000..d6fd101 --- /dev/null +++ b/yapps2.py @@ -0,0 +1,113 @@ +#!/usr/bin/python + +# +# Yapps 2 - yet another python parser system +# Copyright 1999-2003 by Amit J. Patel +# +# This version of Yapps 2 can be distributed under the +# terms of the MIT open source license, either found in the LICENSE file +# included with the Yapps distribution +# or at +# +# + +import sys, re + +from yapps import runtime, parsetree + +def generate(inputfilename, outputfilename='', dump=0, **flags): + """Generate a grammar, given an input filename (X.g) + and an output filename (defaulting to X.py).""" + + if not outputfilename: + if inputfilename.endswith('.g'): + outputfilename = inputfilename[:-2] + '.py' + else: + raise Exception('Must specify output filename if input filename is not *.g') + + DIVIDER = '\n%%\n' # This pattern separates the pre/post parsers + preparser, postparser = None, None # Code before and after the parser desc + + # Read the entire file + s = open(inputfilename,'r').read() + + # See if there's a separation between the pre-parser and parser + f = s.find(DIVIDER) + if f >= 0: preparser, s = s[:f]+'\n\n', s[f+len(DIVIDER):] + + # See if there's a separation between the parser and post-parser + f = s.find(DIVIDER) + if f >= 0: s, postparser = s[:f], '\n\n'+s[f+len(DIVIDER):] + + # Create the parser and scanner and parse the text + scanner = grammar.ParserDescriptionScanner(s, filename=inputfilename) + if preparser: scanner.del_line += 
preparser.count('\n') + + parser = grammar.ParserDescription(scanner) + t = runtime.wrap_error_reporter(parser, 'Parser') + if t is None: return 1 # Failure + if preparser is not None: t.preparser = preparser + if postparser is not None: t.postparser = postparser + + # Check the options + for f in t.options.keys(): + for opt,_,_ in yapps_options: + if f == opt: break + else: + print >>sys.stderr, 'Warning: unrecognized option', f + # Add command line options to the set + for f in flags.keys(): t.options[f] = flags[f] + + # Generate the output + if dump: + t.dump_information() + else: + t.output = open(outputfilename, 'w') + t.generate_output() + return 0 + +if __name__ == '__main__': + import doctest + doctest.testmod(sys.modules['__main__']) + doctest.testmod(parsetree) + + # Someday I will use optparse, but Python 2.3 is too new at the moment. + yapps_options = [ + ('context-insensitive-scanner', + 'context-insensitive-scanner', + 'Scan all tokens (see docs)'), + ] + + import getopt + optlist, args = getopt.getopt(sys.argv[1:], 'f:', ['help', 'dump', 'use-devel-grammar']) + if not args or len(args) > 2: + print >>sys.stderr, 'Usage:' + print >>sys.stderr, ' python', sys.argv[0], '[flags] input.g [output.py]' + print >>sys.stderr, 'Flags:' + print >>sys.stderr, (' --dump' + ' '*40)[:35] + 'Dump out grammar information' + print >>sys.stderr, (' --use-devel-grammar' + ' '*40)[:35] + 'Use the devel grammar parser from yapps_grammar.py instead of the stable grammar from grammar.py' + for flag, _, doc in yapps_options: + print >>sys.stderr, (' -f' + flag + ' '*40)[:35] + doc + else: + # Read in the options and create a list of flags + flags = {} + use_devel_grammar = 0 + for opt in optlist: + for flag, name, _ in yapps_options: + if opt == ('-f', flag): + flags[name] = 1 + break + else: + if opt == ('--dump', ''): + flags['dump'] = 1 + elif opt == ('--use-devel-grammar', ''): + use_devel_grammar = 1 + else: + print >>sys.stderr, 'Warning: unrecognized option', opt[0], 
opt[1] + + if use_devel_grammar: + import yapps_grammar as grammar + else: + from yapps import grammar + + sys.exit(generate(*tuple(args), **flags)) diff --git a/yapps_grammar.g b/yapps_grammar.g new file mode 100644 index 0000000..cd21a9e --- /dev/null +++ b/yapps_grammar.g @@ -0,0 +1,121 @@ +# grammar.py, part of Yapps 2 - yet another python parser system +# Copyright 1999-2003 by Amit J. Patel +# Enhancements copyright 2003-2004 by Matthias Urlichs +# +# This version of the Yapps 2 grammar can be distributed under the +# terms of the MIT open source license, either found in the LICENSE +# file included with the Yapps distribution +# or at +# +# + +"""Parser for Yapps grammars. + +This file defines the grammar of Yapps grammars. Naturally, it is +implemented in Yapps. The grammar.py module needed by Yapps is built +by running Yapps on yapps_grammar.g. (Holy circularity, Batman!) + +""" + +import sys, re +from yapps import parsetree + +###################################################################### +def cleanup_choice(rule, lst): + if len(lst) == 0: return Sequence(rule, []) + if len(lst) == 1: return lst[0] + return parsetree.Choice(rule, *tuple(lst)) + +def cleanup_sequence(rule, lst): + if len(lst) == 1: return lst[0] + return parsetree.Sequence(rule, *tuple(lst)) + +def resolve_name(rule, tokens, id, args): + if id in [x[0] for x in tokens]: + # It's a token + if args: + print 'Warning: ignoring parameters on TOKEN %s<<%s>>' % (id, args) + return parsetree.Terminal(rule, id) + else: + # It's a name, so assume it's a nonterminal + return parsetree.NonTerminal(rule, id, args) + +%% +parser ParserDescription: + + ignore: "[ \t\r\n]+" + ignore: "#.*?\r?\n" + token EOF: "$" + token ATTR: "<<.+?>>" + token STMT: "{{.+?}}" + token ID: '[a-zA-Z_][a-zA-Z_0-9]*' + token STR: '[rR]?\'([^\\n\'\\\\]|\\\\.)*\'|[rR]?"([^\\n"\\\\]|\\\\.)*"' + token LP: '\\(' + token RP: '\\)' + token LB: '\\[' + token RB: '\\]' + token OR: '[|]' + token STAR: '[*]' + token PLUS: '[+]' 
+ token QUEST: '[?]' + token COLON: ':' + + rule Parser: "parser" ID ":" + Options + Tokens + Rules<> + EOF + {{ return parsetree.Generator(ID,Options,Tokens,Rules) }} + + rule Options: {{ opt = {} }} + ( "option" ":" Str {{ opt[Str] = 1 }} )* + {{ return opt }} + + rule Tokens: {{ tok = [] }} + ( + "token" ID ":" Str {{ tok.append( (ID,Str) ) }} + | "ignore" + ":" Str {{ ign = ('#ignore',Str) }} + ( STMT {{ ign = ign + (STMT[2:-2],) }} )? + {{ tok.append( ign ) }} + )* + {{ return tok }} + + rule Rules<>: + {{ rul = [] }} + ( + "rule" ID OptParam ":" ClauseA<> + {{ rul.append( (ID, OptParam, ClauseA) ) }} + )* + {{ return rul }} + + rule ClauseA<>: + ClauseB<> + {{ v = [ClauseB] }} + ( OR ClauseB<> {{ v.append(ClauseB) }} )* + {{ return cleanup_choice(rule,v) }} + + rule ClauseB<>: + {{ v = [] }} + ( ClauseC<> {{ v.append(ClauseC) }} )* + {{ return cleanup_sequence(rule, v) }} + + rule ClauseC<>: + ClauseD<> + ( PLUS {{ return parsetree.Plus(rule, ClauseD) }} + | STAR {{ return parsetree.Star(rule, ClauseD) }} + | QUEST {{ return parsetree.Option(rule, ClauseD) }} + | {{ return ClauseD }} ) + + rule ClauseD<>: + STR {{ t = (STR, eval(STR,{},{})) }} + {{ if t not in tokens: tokens.insert( 0, t ) }} + {{ return parsetree.Terminal(rule, STR) }} + | ID OptParam {{ return resolve_name(rule,tokens, ID, OptParam) }} + | LP ClauseA<> RP {{ return ClauseA }} + | LB ClauseA<> RB {{ return parsetree.Option(rule, ClauseA) }} + | STMT {{ return parsetree.Eval(rule, STMT[2:-2]) }} + + rule OptParam: [ ATTR {{ return ATTR[2:-2] }} ] {{ return '' }} + rule Str: STR {{ return eval(STR,{},{}) }} +%% -- cgit