author     sienkiew <sienkiew@d34015c8-bcbb-4646-8ac8-8ba5febf221d>   2011-07-21 11:17:58 -0400
committer  sienkiew <sienkiew@d34015c8-bcbb-4646-8ac8-8ba5febf221d>   2011-07-21 11:17:58 -0400
commit     be5a70d3aa1c30d7c86d77649b747de2838566ce (patch)
tree       7c27103a4d37b61f5dba748672f0685536e667d0
parent     77ce1e78848ba9eead2566e3bc55523aab4547e8 (diff)
download   exyapps-be5a70d3aa1c30d7c86d77649b747de2838566ce.tar.gz
initial import of yapps from debian sources
git-svn-id: http://svn.stsci.edu/svn/ssb/etal/exyapps/trunk@356 d34015c8-bcbb-4646-8ac8-8ba5febf221d
-rw-r--r--   LICENSE               |   20
-rw-r--r--   changelog             |  983
-rw-r--r--   doc/yapps2.haux       |   31
-rw-r--r--   doc/yapps2.html       | 1206
-rw-r--r--   doc/yapps2.htoc       |   36
-rw-r--r--   doc/yapps2.tex        | 1246
-rw-r--r--   examples/calc.g       |   64
-rw-r--r--   examples/expr.g       |   21
-rw-r--r--   examples/lisp.g       |   13
-rw-r--r--   examples/notes        |   44
-rw-r--r--   examples/xml.g        |   66
-rw-r--r--   setup.py              |   42
-rw-r--r--   test/empty_clauses.g  |   10
-rw-r--r--   test/line_numbers.g   |   10
-rw-r--r--   test/option.g         |   17
-rw-r--r--   yapps/__init__.py     |    1
-rw-r--r--   yapps/grammar.py      |  211
-rw-r--r--   yapps/parsetree.py    |  673
-rw-r--r--   yapps/runtime.py      |  442
-rwxr-xr-x   yapps2.py             |  113
-rw-r--r--   yapps_grammar.g       |  121

21 files changed, 5370 insertions, 0 deletions
@@ -0,0 +1,20 @@
+<http://www.opensource.org/licenses/mit-license.php>
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/changelog b/changelog
new file mode 100644
index 0000000..4fdf12e
--- /dev/null
+++ b/changelog
@@ -0,0 +1,983 @@
+ChangeSet + 1.38 05/01/22 19:36:32 smurf@smurf.noris.de +2 -0 + Add option to limit backtrace depth on syntax errors. + + yapps/runtime.py + 1.15 05/01/22 19:36:31 smurf@smurf.noris.de +5 -1 + Add option to limit backtrace depth on syntax errors. + + debian/changelog + 1.20 05/01/22 19:36:31 smurf@smurf.noris.de +2 -1 + Add option to limit backtrace depth on syntax errors. + +ChangeSet + 1.37 05/01/22 03:39:56 smurf@smurf.noris.de +2 -0 + Fix recursive includes. + + yapps/runtime.py + 1.14 05/01/22 03:39:54 smurf@smurf.noris.de +395 -381 + Fix recursive includes. + + debian/changelog + 1.19 05/01/22 03:39:54 smurf@smurf.noris.de +6 -0 + Fix recursive includes. + +ChangeSet + 1.36 04/12/23 23:49:52 smurf@smurf.noris.de +1 -0 + Brown paper bag -- fix Python 2.4 stuff. + + debian/changelog + 1.18 04/12/23 23:49:52 smurf@smurf.noris.de +6 -0 + Brown paper bag -- fix Python 2.4 stuff. 
+ +ChangeSet + 1.35 04/12/23 21:00:34 smurf@smurf.noris.de +1 -0 + typo + + debian/control + 1.10 04/12/23 21:00:33 smurf@smurf.noris.de +1 -1 + typo + +ChangeSet + 1.34 04/12/12 20:22:54 smurf@smurf.noris.de +2 -0 + Add support for Python 2.4 + + debian/control + 1.9 04/12/12 20:22:52 smurf@smurf.noris.de +1 -1 + Add support for Python 2.4 + + debian/changelog + 1.17 04/12/12 20:22:52 smurf@smurf.noris.de +6 -0 + Add support for Python 2.4 + +ChangeSet + 1.33 04/09/23 11:24:16 smurf@smurf.noris.de +3 -0 + update documentation: + - toss hyphens + - document extensions + + doc/yapps2.tex + 1.3 04/09/23 11:24:16 smurf@smurf.noris.de +21 -0 + add a Debian Extensions section + + debian/yapps.1 + 1.2 04/09/23 11:24:16 smurf@smurf.noris.de +14 -9 + escape more hyphens (i.e., all the rest) + + debian/changelog + 1.16 04/09/23 11:24:16 smurf@smurf.noris.de +2 -0 + update documentation: + - toss hyphens + - document extensions + +ChangeSet + 1.32 04/09/23 11:23:24 smurf@smurf.noris.de +2 -0 + turn off triggers + + BitKeeper/triggers/pre-commit.upversion + 1.2 04/09/23 11:23:24 smurf@smurf.noris.de +2 -0 + off + + BitKeeper/triggers/post-commit.changelog + 1.2 04/09/23 11:23:24 smurf@smurf.noris.de +2 -0 + off + +ChangeSet + 1.31 04/09/23 10:55:24 smurf@smurf.noris.de +1 -0 + ignore new package's files + + BitKeeper/etc/ignore + 1.17 04/09/23 10:55:23 smurf@smurf.noris.de +1 -0 + added debian/yapps2-runtime/* + + debian/yapps2-runtime.README + 1.1 04/09/23 10:50:33 smurf@smurf.noris.de +11 -0 + +ChangeSet + 1.30 04/09/23 10:50:33 smurf@smurf.noris.de +8 -0 + split off runtime to its own package + document the fact that I can't use the original runtime + + debian/yapps2-runtime.dirs + 1.6 04/09/23 10:50:33 smurf@smurf.noris.de +2 -4 + split off runtime + + debian/yapps2-runtime.README + 1.0 04/09/23 10:50:33 smurf@smurf.noris.de +0 -0 + BitKeeper file /daten/src/debian/python_yapps/debian/yapps2-runtime.README + + debian/rules + 1.5 04/09/23 10:50:33 smurf@smurf.noris.de +4 -1 + move runtime files to their own package + + debian/control + 1.8 04/09/23 10:50:33 smurf@smurf.noris.de +14 -1 + split off runtime to its own package + + debian/changelog + 1.15 04/09/23 10:50:33 smurf@smurf.noris.de +9 -0 + document package split + + debian/README + 1.2 04/09/23 10:50:33 smurf@smurf.noris.de +21 -4 + Updated for package split + + debian/yapps2.docs + 1.3 04/09/23 10:31:15 smurf@smurf.noris.de +0 -0 + Rename: debian/docs -> debian/yapps2.docs + + debian/yapps2-runtime.dirs + 1.5 04/09/23 10:30:48 smurf@smurf.noris.de +0 -0 + bk cp yapps2.dirs yapps2-runtime.dirs + + debian/yapps2.dirs + 1.4 04/09/23 10:30:42 smurf@smurf.noris.de +0 -0 + Rename: debian/dirs -> debian/yapps2.dirs + + debian/yapps2.dirs + 1.4 04/09/23 10:30:42 smurf@smurf.noris.de +0 -0 + Rename: debian/dirs -> debian/yapps2.dirs + +ChangeSet + 1.29 04/07/19 09:30:22 smurf@smurf.noris.de +5 -0 + latex2html => hevea + + debian/yapps2.doc-base + 1.2 04/07/19 09:30:21 smurf@smurf.noris.de +2 -2 + latex2html => hevea + + debian/rules + 1.4 04/07/19 09:30:21 smurf@smurf.noris.de +4 -2 + latex2html => hevea + + debian/control + 1.7 04/07/19 09:30:21 smurf@smurf.noris.de +1 -1 + latex2html => hevea + + debian/changelog + 1.14 04/07/19 09:30:21 smurf@smurf.noris.de +6 -0 + latex2html => hevea + + BitKeeper/etc/ignore + 1.16 04/07/19 09:30:06 smurf@smurf.noris.de +1 -0 + added doc/yapps2.haux + + BitKeeper/etc/ignore + 1.15 04/07/19 09:29:55 smurf@smurf.noris.de +1 -0 + added doc/yapps2.ht* + +ChangeSet + 1.28 04/07/12 09:35:59 smurf@smurf.noris.de +2 -0 
+ Build-Depend on python. + + debian/control + 1.6 04/07/12 09:35:58 smurf@smurf.noris.de +1 -1 + Build-Depend on python. + + debian/changelog + 1.13 04/07/12 09:35:58 smurf@smurf.noris.de +6 -0 + doc + +ChangeSet + 1.27 04/05/16 22:02:40 smurf@smurf.noris.de +2 -0 + ship "empty" file + + yapps/__init__.py + 1.2 04/05/16 22:02:39 smurf@smurf.noris.de +1 -0 + ship "empty" file + + debian/changelog + 1.12 04/05/16 22:02:39 smurf@smurf.noris.de +2 -1 + doc + +ChangeSet + 1.26 04/05/16 22:01:42 smurf@smurf.noris.de +2 -0 + Typo (made large file handling slow) + + yapps/runtime.py + 1.13 04/05/16 22:01:42 smurf@smurf.noris.de +1 -1 + Typo + + debian/changelog + 1.11 04/05/16 22:01:42 smurf@smurf.noris.de +6 -0 + Version 2.1.1-11 + +ChangeSet + 1.25 04/05/14 12:25:51 smurf@smurf.noris.de +1 -0 + exporter: test was in wrong dir + + debian/exporter + 1.3 04/05/14 12:25:51 smurf@smurf.noris.de +1 -0 + wrong dir + +ChangeSet + 1.24 04/05/14 12:20:04 smurf@smurf.noris.de +1 -0 + Clean up external source before generating a diff + + debian/exporter + 1.2 04/05/14 12:20:04 smurf@smurf.noris.de +7 -0 + Clean up external source before generating a diff + +ChangeSet + 1.23 04/05/14 12:14:34 smurf@linux.smurf.noris.de +13 -0 + Documentation update: + build and install HTML documentation from LaTex source + + debian/changelog + 1.10 04/05/14 12:14:33 smurf@linux.smurf.noris.de +7 -0 + Version 2.1.1-10 + + debian/yapps2.doc-base + 1.1 04/05/14 12:14:32 smurf@smurf.noris.de +13 -0 + + yapps_grammar.g + 1.5 04/05/14 12:14:31 smurf@smurf.noris.de +1 -0 + add my copyright notice + + yapps/runtime.py + 1.12 04/05/14 12:14:31 smurf@smurf.noris.de +1 -0 + add my copyright notice + + debian/yapps2.doc-base + 1.0 04/05/14 12:14:31 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/yapps2.doc-base + + debian/rules + 1.3 04/05/14 12:14:31 smurf@smurf.noris.de +5 -5 + gernerate and install html documentation + don't install LICENSE file + + debian/yapps2-runtime.dirs + 1.3 04/05/14 12:14:30 smurf@smurf.noris.de +1 -2 + drop overrides + add doc dir +html + + debian/docs + 1.2 04/05/14 12:14:30 smurf@smurf.noris.de +1 -1 + install latex documentation + + debian/dirs + 1.3 04/05/14 12:14:30 smurf@smurf.noris.de +1 -2 + drop overrides + add doc dir +html + + debian/copyright + 1.3 04/05/14 12:14:30 smurf@smurf.noris.de +21 -3 + include license here instead of installing a LICENSE file + + debian/control + 1.5 04/05/14 12:14:30 smurf@smurf.noris.de +6 -6 + Dep on latex2html + indent list + + BitKeeper/etc/ignore + 1.14 04/05/14 12:06:12 smurf@smurf.noris.de +1 -0 + added doc/yapps2/* + + BitKeeper/etc/ignore + 1.13 04/05/14 12:06:07 smurf@smurf.noris.de +3 -0 + added debian/yapps2/* debian/*.substvars debian/*.debhelper + + BitKeeper/deleted/.del-overrides.lintian~19711613dc4ce90f + 1.3 04/05/14 11:51:33 smurf@smurf.noris.de +0 -0 + Delete: debian/overrides.lintian + + BitKeeper/deleted/.del-overrides.linda~b0c6fa08da170a16 + 1.2 04/05/14 11:51:33 smurf@smurf.noris.de +0 -0 + Delete: debian/overrides.linda + + doc/yapps2.tex + 1.2 04/05/14 11:34:34 smurf@smurf.noris.de +0 -0 + Rename: yapps2.tex -> doc/yapps2.tex + +ChangeSet + 1.22 04/05/14 11:33:27 smurf@smurf.noris.de +1 -0 + Merge bk://server/public/python_yapps + into smurf.noris.de:/usr/local/src/misc/yapps + + BitKeeper/deleted/.del-logging_ok~530b65bc14e5cc7c + 1.2 04/05/14 11:33:26 smurf@smurf.noris.de +0 -0 + 'Auto converge rename' + + BitKeeper/etc/logging_ok + 1.1 04/05/14 11:33:13 smurf@smurf.noris.de +1 -0 + +ChangeSet + 1.4.1.1 
04/05/14 11:33:13 smurf@linux.smurf.noris.de +2 -0 + Added tex documentation from yapps-2.0.4. + + BitKeeper/etc/logging_ok + 1.0 04/05/14 11:33:13 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/BitKeeper/etc/logging_ok + + yapps2.tex + 1.1 04/05/14 11:33:10 smurf@smurf.noris.de +1225 -0 + + yapps2.tex + 1.0 04/05/14 11:33:10 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/yapps2.tex + +ChangeSet + 1.21 04/05/14 11:31:18 smurf@linux.smurf.noris.de +7 -0 + Renamed the package to "yapps2". + + setup.py + 1.2 04/05/14 11:31:17 smurf@smurf.noris.de +17 -2 + Fixed name + Updated long description + + debian/yapps2-runtime.dirs + 1.2 04/05/14 11:31:17 smurf@smurf.noris.de +1 -1 + rename: python-yapps => yapps2 + + debian/rules + 1.2 04/05/14 11:31:17 smurf@smurf.noris.de +8 -8 + rename: python-yapps => yapps2 + + debian/overrides.lintian + 1.2 04/05/14 11:31:17 smurf@smurf.noris.de +1 -1 + rename: python-yapps => yapps2 + + debian/dirs + 1.2 04/05/14 11:31:17 smurf@smurf.noris.de +1 -1 + rename: python-yapps => yapps2 + + debian/copyright + 1.2 04/05/14 11:31:16 smurf@smurf.noris.de +11 -3 + Added pointer to original source + + debian/control + 1.4 04/05/14 11:31:16 smurf@smurf.noris.de +18 -10 + rename: python-yapps => yapps2 + + debian/changelog + 1.9 04/05/14 11:31:16 smurf@smurf.noris.de +13 -13 + Cleanup + +ChangeSet + 1.20 03/12/31 14:00:42 smurf@linux.smurf.noris.de +2 -0 + require python-dev because of distutils + + debian/changelog + 1.8 03/12/31 14:00:42 smurf@linux.smurf.noris.de +6 -0 + Version 2.1.1-8 + + debian/control + 1.3 03/12/31 14:00:40 smurf@smurf.noris.de +1 -1 + require python-dev because of distutils + +ChangeSet + 1.19 03/12/31 13:57:38 smurf@linux.smurf.noris.de +2 -0 + Change yapps.py t exit 1 on failure to parse + + debian/changelog + 1.7 03/12/31 13:57:38 smurf@linux.smurf.noris.de +6 -0 + Version 2.1.1-7 + + yapps2.py + 1.6 03/12/31 13:57:37 smurf@smurf.noris.de +3 -2 + exit 1 on error + +ChangeSet + 1.18 03/12/30 15:36:56 smurf@linux.smurf.noris.de +2 -0 + Update to 3.6.1, use build-depends-indep. + + debian/changelog + 1.6 03/12/30 15:36:56 smurf@linux.smurf.noris.de +6 -0 + Version 2.1.1-6 + + debian/control + 1.2 03/12/30 15:36:55 smurf@smurf.noris.de +2 -2 + Update to 3.6.1, use build-depends-indep. + +ChangeSet + 1.17 03/12/30 15:33:19 smurf@linux.smurf.noris.de +2 -0 + Add some notes. + + debian/changelog + 1.5 03/12/30 15:33:19 smurf@linux.smurf.noris.de +6 -0 + Version 2.1.1-5 + + examples/notes + 1.1 03/12/30 15:33:18 smurf@smurf.noris.de +44 -0 + + examples/notes + 1.0 03/12/30 15:33:17 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/examples/notes + +ChangeSet + 1.16 03/12/30 15:30:05 smurf@linux.smurf.noris.de +2 -0 + Correctly report syntax errors without line number + + debian/changelog + 1.4 03/12/30 15:30:05 smurf@linux.smurf.noris.de +6 -0 + Version 2.1.1-4 + + yapps/runtime.py + 1.11 03/12/30 15:30:04 smurf@smurf.noris.de +6 -2 + Report syntax errors with no line number + +ChangeSet + 1.15 03/12/30 14:02:37 smurf@linux.smurf.noris.de +2 -0 + Repair ignored-pattern upcall. + + debian/changelog + 1.3 03/12/30 14:02:37 smurf@linux.smurf.noris.de +6 -0 + Version 2.1.1-3 + + yapps/runtime.py + 1.10 03/12/30 14:02:36 smurf@smurf.noris.de +4 -2 + Repair ignore upcall. 
+ +ChangeSet + 1.14 03/12/30 13:30:14 smurf@linux.smurf.noris.de +2 -0 + runtime: fix error reporting + + debian/changelog + 1.2 03/12/30 13:30:14 smurf@linux.smurf.noris.de +6 -0 + Version 2.1.1-2 + + yapps/runtime.py + 1.9 03/12/30 13:30:12 smurf@smurf.noris.de +9 -9 + Fix error reporting + +ChangeSet + 1.13 03/12/30 12:25:29 smurf@linux.smurf.noris.de +2 -0 + replace runtime grammar + yapps_grammar.g: delete shebang line, fix imports + + yapps_grammar.g + 1.4 03/12/30 12:25:28 smurf@smurf.noris.de +1 -3 + fix import + delete shebang line + + yapps/grammar.py + 1.11 03/12/30 12:25:28 smurf@smurf.noris.de +49 -66 + replace runtime grammar + +ChangeSet + 1.12 03/12/30 11:51:26 smurf@linux.smurf.noris.de +19 -0 + D + + setup.py + 1.1 03/12/30 11:51:25 smurf@smurf.noris.de +27 -0 + + debian/yapps.1 + 1.1 03/12/30 11:51:25 smurf@smurf.noris.de +58 -0 + + debian/rules + 1.1 03/12/30 11:51:25 smurf@smurf.noris.de +91 -0 + + debian/overrides.lintian + 1.1 03/12/30 11:51:25 smurf@smurf.noris.de +1 -0 + + setup.py + 1.0 03/12/30 11:51:25 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/setup.py + + debian/yapps2-runtime.dirs + 1.1 03/12/30 11:51:24 smurf@smurf.noris.de +5 -0 + + debian/yapps.1 + 1.0 03/12/30 11:51:25 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/yapps.1 + + debian/rules + 1.0 03/12/30 11:51:25 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/rules + + debian/overrides.lintian + 1.0 03/12/30 11:51:25 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/overrides.lintian + + debian/overrides.linda + 1.1 03/12/30 11:51:24 smurf@smurf.noris.de +3 -0 + + debian/exporter + 1.1 03/12/30 11:51:24 smurf@smurf.noris.de +10 -0 + + debian/docs + 1.1 03/12/30 11:51:24 smurf@smurf.noris.de +3 -0 + + debian/dirs + 1.1 03/12/30 11:51:24 smurf@smurf.noris.de +5 -0 + + debian/copyright + 1.1 03/12/30 11:51:24 smurf@smurf.noris.de +15 -0 + + debian/control + 1.1 03/12/30 11:51:24 smurf@smurf.noris.de +19 -0 + + debian/compat + 1.1 03/12/30 11:51:24 smurf@smurf.noris.de +1 -0 + + debian/README + 1.1 03/12/30 11:51:24 smurf@smurf.noris.de +6 -0 + + yapps_grammar.g + 1.3 03/12/30 11:51:24 smurf@smurf.noris.de +0 -1 + Make the scanner context-sensitive. Works better. + + yapps2.py + 1.5 03/12/30 11:51:24 smurf@smurf.noris.de +1 -1 + Fix path + + yapps/runtime.py + 1.8 03/12/30 11:51:24 smurf@smurf.noris.de +0 -1 + Regularize header + + yapps/parsetree.py + 1.7 03/12/30 11:51:24 smurf@smurf.noris.de +0 -2 + Drop shebang line, this is not a program. + + yapps/grammar.py + 1.10 03/12/30 11:51:24 smurf@smurf.noris.de +0 -2 + Drop shebang line, this is not a program. 
+ + debian/yapps2-runtime.dirs + 1.0 03/12/30 11:51:24 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/dirs + + debian/overrides.linda + 1.0 03/12/30 11:51:24 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/overrides.linda + + debian/exporter + 1.0 03/12/30 11:51:24 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/exporter + + debian/docs + 1.0 03/12/30 11:51:24 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/docs + + debian/dirs + 1.0 03/12/30 11:51:24 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/dirs + + debian/copyright + 1.0 03/12/30 11:51:24 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/copyright + + debian/control + 1.0 03/12/30 11:51:24 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/control + + debian/compat + 1.0 03/12/30 11:51:24 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/compat + + debian/README + 1.0 03/12/30 11:51:24 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/README + + debian/changelog + 1.1 03/12/30 11:41:14 smurf@smurf.noris.de +15 -0 + + debian/changelog + 1.0 03/12/30 11:41:14 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/debian/changelog + + BitKeeper/etc/ignore + 1.12 03/12/30 11:40:56 smurf@smurf.noris.de +1 -0 + added changelog + +ChangeSet + 1.11 03/12/30 11:23:09 smurf@linux.smurf.noris.de +5 -0 + Rewrote imports et al. to create+use a real "yapps" module + + yapps/__init__.py + 1.1 03/12/30 11:23:08 smurf@smurf.noris.de +0 -0 + + yapps2.py + 1.4 03/12/30 11:23:08 smurf@smurf.noris.de +3 -3 + Refactor to use reasonable "yapps" module + + yapps/runtime.py + 1.7 03/12/30 11:23:08 smurf@smurf.noris.de +0 -0 + Rename: yappsrt.py -> yapps/runtime.py + + yapps/parsetree.py + 1.6 03/12/30 11:23:08 smurf@smurf.noris.de +10 -10 + Refactor to use reasonable "yapps" module + Rename: parsetree.py -> yapps/parsetree.py + + yapps/__init__.py + 1.0 03/12/30 11:23:08 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/yapps/__init__.py + + yapps/grammar.py + 1.9 03/12/30 11:23:07 smurf@smurf.noris.de +16 -16 + Refactor to use reasonable "yapps" module + Rename: grammar.py -> yapps/grammar.py + + BitKeeper/etc/ignore + 1.11 03/12/30 11:22:15 smurf@smurf.noris.de +4 -0 + added build/* debian/python-yapps/* debian/*.debhelper debian/*.substvars + +ChangeSet + 1.10 03/12/29 22:10:59 smurf@linux.smurf.noris.de +1 -0 + Added context-insensitive-scanner end test + to a couple of composite grammar statements. + + It's probably overkill to do that with all statements..? + + parsetree.py + 1.5 03/12/29 22:10:58 smurf@smurf.noris.de +19 -5 + Added context-insensitive-scanner end test + to a couple of composite statements. + + It's probably overkill to do that with all statements..? 
+ +ChangeSet + 1.9 03/12/29 22:05:00 smurf@linux.smurf.noris.de +1 -0 + yappsrt.py: Bugfix: stored token type + + yappsrt.py + 1.6 03/12/29 22:04:59 smurf@smurf.noris.de +1 -1 + Bugfix: stored token type + +ChangeSet + 1.8 03/12/29 21:42:41 smurf@linux.smurf.noris.de +4 -0 + Pass init arguments to the scanner + Simplify stuff a bit + + yappsrt.py + 1.5 03/12/29 21:42:40 smurf@smurf.noris.de +16 -8 + Fix line counting + simplify pattern length determination + + yapps2.py + 1.3 03/12/29 21:42:40 smurf@smurf.noris.de +3 -2 + Pass filename to scanner + + parsetree.py + 1.4 03/12/29 21:42:40 smurf@smurf.noris.de +2 -2 + Pass init arguments to the scanner + + grammar.py + 1.8 03/12/29 21:42:40 smurf@smurf.noris.de +3 -2 + Update for changed yapps2.py + +ChangeSet + 1.7 03/12/29 20:37:55 smurf@linux.smurf.noris.de +6 -0 + Cleanup ignored-symbol commands + Fix including and error reporting + + yappsrt.py + 1.4 03/12/29 20:37:54 smurf@smurf.noris.de +88 -52 + cleanup ignored symbol handling + refactor _scan and _peek: move to Scanner + generate pseudo filenames for inline documents + accept context for error handling + + yapps_grammar.g + 1.2 03/12/29 20:37:54 smurf@smurf.noris.de +4 -1 + Cleanup statements for ignored symbols + + yapps2.py + 1.2 03/12/29 20:37:54 smurf@smurf.noris.de +1 -1 + Setup line numbers correctly + + parsetree.py + 1.3 03/12/29 20:37:54 smurf@smurf.noris.de +22 -6 + Ignored-symbol handling extended + Pass context to scanners + + grammar.py + 1.7 03/12/29 20:37:54 smurf@smurf.noris.de +1 -1 + Use a hash for ignored stuff + + examples/calc.g + 1.3 03/12/29 20:37:54 smurf@smurf.noris.de +7 -4 + Cleanup include handling: use an ignored token + +ChangeSet + 1.6 03/12/29 18:16:19 smurf@linux.smurf.noris.de +6 -0 + Reproduce current grammar file + + One problem with attribute remains. 
+ + yapps_grammar.g + 1.1 03/12/29 18:16:18 smurf@smurf.noris.de +120 -0 + + yappsrt.py + 1.3 03/12/29 18:16:18 smurf@smurf.noris.de +5 -5 + charpos => pos + + yapps_grammar.g + 1.0 03/12/29 18:16:18 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/yapps_grammar.g + + BitKeeper/etc/ignore + 1.10 03/12/29 18:15:23 smurf@smurf.noris.de +1 -0 + added *.swp + + BitKeeper/etc/ignore + 1.9 03/12/29 18:15:02 smurf@smurf.noris.de +1 -0 + added yapps_grammar.py + + BitKeeper/deleted/.del-yapps_grammar.py~276aa227aa238250 + 1.4 03/12/29 18:14:35 smurf@smurf.noris.de +0 -0 + Delete: yapps_grammar.py + + grammar.py + 1.6 03/12/29 18:14:17 smurf@smurf.noris.de +0 -0 + Rename: BitKeeper/deleted/.del-grammar.py~46b22024b3b85127 -> grammar.py + + BitKeeper/deleted/.del-grammar.py~46b22024b3b85127 + 1.5 03/12/29 18:13:11 smurf@smurf.noris.de +0 -0 + Delete: grammar.py + + yapps_grammar.py + 1.3 03/12/29 18:13:04 smurf@smurf.noris.de +0 -0 + Rename: BitKeeper/deleted/.del-yapps_grammar.py~276aa227aa238250 -> yapps_grammar.py + + grammar.py + 1.4 03/12/29 18:12:51 smurf@smurf.noris.de +0 -0 + Rename: BitKeeper/deleted/.del-grammar.py~46b22024b3b85127 -> grammar.py + + BitKeeper/deleted/.del-grammar.py~46b22024b3b85127 + 1.3 03/12/29 18:12:30 smurf@smurf.noris.de +19 -20 + Delete: grammar.py + + BitKeeper/etc/ignore + 1.8 03/12/29 17:15:10 smurf@smurf.noris.de +3 -0 + added *-stamp debian/files debian/tmp/* + + BitKeeper/etc/ignore + 1.7 03/12/29 17:15:08 smurf@smurf.noris.de +3 -0 + added *-stamp debian/files debian/tmp/* + + BitKeeper/etc/ignore + 1.6 03/12/29 17:14:00 smurf@smurf.noris.de +3 -0 + added *-stamp debian/files debian/tmp/* + + BitKeeper/triggers/post-commit.changelog + 1.1 03/12/29 17:13:58 smurf@smurf.noris.de +3 -0 + + BitKeeper/triggers/post-commit.changelog + 1.0 03/12/29 17:13:58 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/BitKeeper/triggers/post-commit.changelog + + BitKeeper/etc/logging_ok + 1.1 03/12/29 17:13:41 smurf@smurf.noris.de +1 -0 + +ChangeSet + 1.5 03/12/29 17:13:41 smurf@linux.smurf.noris.de +7 -0 + Major enhancements: + - Use a Token object + - Allow incremental reading from a file + - Allow stacking of inputs (#include, whatever) + - Remember line numbers + - Refactor print_line_with_error into the Scanner object + + BitKeeper/etc/logging_ok + 1.0 03/12/29 17:13:41 smurf@smurf.noris.de +0 -0 + BitKeeper file /usr/local/src/misc/yapps/BitKeeper/etc/logging_ok + + yappsrt.py + 1.2 03/12/29 17:13:38 smurf@smurf.noris.de +219 -141 + Major enhancements: + - Use a Token object + - Allow incremental reading from a file + - Allow stacking of inputs (#include, whatever) + - Remember line numbers + - Refactor print_line_with_error into the Scanner object + + parsetree.py + 1.2 03/12/29 17:13:38 smurf@smurf.noris.de +2 -2 + don't pass pos explicitly + + grammar.py + 1.2 03/12/29 17:13:38 smurf@smurf.noris.de +15 -19 + Cleanup + + generated from non-available file! 
+ + examples/calc.g + 1.2 03/12/29 17:13:38 smurf@smurf.noris.de +5 -2 + cleanup (strip, atoi) + + allow reading in of expressions via stacking (TEST) + + BitKeeper/etc/ignore + 1.5 03/12/29 17:10:56 smurf@smurf.noris.de +1 -0 + added *.pyc + + BitKeeper/etc/ignore + 1.4 03/12/29 17:10:51 smurf@smurf.noris.de +1 -0 + added test/*.py + + BitKeeper/etc/ignore + 1.3 03/12/29 17:10:46 smurf@smurf.noris.de +1 -0 + added examples/*.py + + BitKeeper/deleted/.del-yapps_grammar.py~276aa227aa238250 + 1.2 03/12/29 13:58:49 smurf@smurf.noris.de +0 -0 + Delete: yapps_grammar.py + +ChangeSet + 1.4 03/08/28 00:22:57 ?@smurf.noris.de +15 -0 + Version 2.1.1 + +ChangeSet + 1.3 03/08/28 00:22:57 ?@smurf.noris.de +1 -0 + CVS-Ignore + +ChangeSet + 1.2 03/12/29 12:56:52 smurf@smurf.noris.de +1 -0 + Versionsnummern-Updateskript + + BitKeeper/etc/ignore + 1.2 03/08/28 00:22:57 ?@smurf.noris.de +3 -0 + added CVS .cvsignore CVSROOT + +ChangeSet + 1.1 03/12/29 12:56:52 smurf@smurf.noris.de +2 -0 + Initial repository create + + BitKeeper/triggers/pre-commit.upversion + 1.1 03/12/29 12:56:52 smurf@smurf.noris.de +3 -0 + + BitKeeper/etc/ignore + 1.1 03/12/29 12:56:52 smurf@smurf.noris.de +2 -0 + + BitKeeper/etc/config + 1.1 03/12/29 12:56:52 smurf@smurf.noris.de +13 -0 + +ChangeSet + 1.0 03/12/29 12:56:52 smurf@smurf.noris.de +0 -0 + BitKeeper file /tmp/b.s.20059/ChangeSet + + BitKeeper/triggers/pre-commit.upversion + 1.0 03/12/29 12:56:52 smurf@smurf.noris.de +0 -0 + BitKeeper file /tmp/b.s.20059/BitKeeper/triggers/pre-commit.upversion + + BitKeeper/etc/ignore + 1.0 03/12/29 12:56:52 smurf@smurf.noris.de +0 -0 + BitKeeper file /tmp/b.s.20059/BitKeeper/etc/ignore + + BitKeeper/etc/config + 1.0 03/12/29 12:56:52 smurf@smurf.noris.de +0 -0 + BitKeeper file /tmp/b.s.20059/BitKeeper/etc/config + + yappsrt.py + 1.1 03/08/27 23:12:19 ?@smurf.noris.de +296 -0 + New:Version 2.1.1 + + yapps_grammar.py + 1.1 03/08/28 00:22:32 ?@smurf.noris.de +234 -0 + New:Version 2.1.1 + + yapps2.py + 1.1 03/08/12 20:25:55 ?@smurf.noris.de +111 -0 + New:Version 2.1.1 + + test/option.g + 1.1 03/08/11 20:43:22 ?@smurf.noris.de +17 -0 + New:Version 2.1.1 + + test/line_numbers.g + 1.1 03/08/28 00:22:56 ?@smurf.noris.de +10 -0 + New:Version 2.1.1 + + test/empty_clauses.g + 1.1 03/08/27 23:48:11 ?@smurf.noris.de +10 -0 + New:Version 2.1.1 + + parsetree.py + 1.1 03/08/28 00:18:14 ?@smurf.noris.de +645 -0 + New:Version 2.1.1 + + grammar.py + 1.1 03/08/28 00:16:28 ?@smurf.noris.de +234 -0 + New:Version 2.1.1 + + examples/xml.g + 1.1 03/08/27 20:53:39 ?@smurf.noris.de +66 -0 + New:Version 2.1.1 + + examples/lisp.g + 1.1 03/08/11 20:18:18 ?@smurf.noris.de +13 -0 + New:Version 2.1.1 + + examples/expr.g + 1.1 03/08/08 06:47:58 ?@smurf.noris.de +21 -0 + New:Version 2.1.1 + + examples/calc.g + 1.1 03/08/11 20:17:09 ?@smurf.noris.de +58 -0 + New:Version 2.1.1 + + NOTES + 1.1 03/08/12 18:59:41 ?@smurf.noris.de +78 -0 + New:Version 2.1.1 + + LICENSE + 1.1 03/08/11 19:41:27 ?@smurf.noris.de +20 -0 + New:Version 2.1.1 + + ChangeLog + 1.1 03/08/28 00:22:16 ?@smurf.noris.de +108 -0 + New:Version 2.1.1 + + yappsrt.py + 1.0 03/08/27 23:12:19 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/yappsrt.py + + yapps_grammar.py + 1.0 03/08/28 00:22:32 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/yapps_grammar.py + + yapps2.py + 1.0 03/08/12 20:25:55 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/yapps2.py + + test/option.g + 1.0 03/08/11 20:43:22 ?@smurf.noris.de +0 -0 + BitKeeper file 
/home/smurf/neu/yapps-2.1.1/yapps3/test/option.g + + test/line_numbers.g + 1.0 03/08/28 00:22:56 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/test/line_numbers.g + + test/empty_clauses.g + 1.0 03/08/27 23:48:11 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/test/empty_clauses.g + + parsetree.py + 1.0 03/08/28 00:18:14 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/parsetree.py + + grammar.py + 1.0 03/08/28 00:16:28 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/grammar.py + + examples/xml.g + 1.0 03/08/27 20:53:39 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/examples/xml.g + + examples/lisp.g + 1.0 03/08/11 20:18:18 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/examples/lisp.g + + examples/expr.g + 1.0 03/08/08 06:47:58 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/examples/expr.g + + examples/calc.g + 1.0 03/08/11 20:17:09 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/examples/calc.g + + NOTES + 1.0 03/08/12 18:59:41 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/NOTES + + LICENSE + 1.0 03/08/11 19:41:27 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/LICENSE + + ChangeLog + 1.0 03/08/28 00:22:16 ?@smurf.noris.de +0 -0 + BitKeeper file /home/smurf/neu/yapps-2.1.1/yapps3/ChangeLog + diff --git a/doc/yapps2.haux b/doc/yapps2.haux new file mode 100644 index 0000000..ad51bcd --- /dev/null +++ b/doc/yapps2.haux @@ -0,0 +1,31 @@ +\@addtocsec{htoc}{1}{0}{\@print{1}\quad{}Introduction} +\@addtocsec{htoc}{2}{0}{\@print{2}\quad{}Examples} +\@addtocsec{htoc}{3}{1}{\@print{2.1}\quad{}Introduction to Grammars} +\@addtocsec{htoc}{4}{1}{\@print{2.2}\quad{}Lisp Expressions} +\@addtocsec{htoc}{5}{1}{\@print{2.3}\quad{}Calculator} +\@addtocsec{htoc}{6}{1}{\@print{2.4}\quad{}Calculator with Memory} +\@addtocsec{htoc}{7}{0}{\@print{3}\quad{}Grammars} +\@addtocsec{htoc}{8}{1}{\@print{3.1}\quad{}Left Factoring} +\newlabel{sec:Left-Factoring}{{3.1}{X}} +\@addtocsec{htoc}{9}{1}{\@print{3.2}\quad{}Left Recursion} +\@addtocsec{htoc}{10}{1}{\@print{3.3}\quad{}Ambiguous Grammars} +\newlabel{sec:Ambiguous-Grammars}{{3.3}{X}} +\@addtocsec{htoc}{11}{0}{\@print{4}\quad{}Customization} +\@addtocsec{htoc}{12}{1}{\@print{4.1}\quad{}Customizing Parsers} +\@addtocsec{htoc}{13}{1}{\@print{4.2}\quad{}Customizing Scanners} +\@addtocsec{htoc}{14}{0}{\@print{5}\quad{}Parser Mechanics} +\@addtocsec{htoc}{15}{1}{\@print{5.1}\quad{}Parser Objects} +\newlabel{sec:Parser-Objects}{{5.1}{X}} +\@addtocsec{htoc}{16}{1}{\@print{5.2}\quad{}Context Sensitive Scanner} +\@addtocsec{htoc}{17}{1}{\@print{5.3}\quad{}Internal Variables} +\@addtocsec{htoc}{18}{1}{\@print{5.4}\quad{}Pre- and Post-Parser Code} +\@addtocsec{htoc}{19}{1}{\@print{5.5}\quad{}Representation of Grammars} +\@addtocsec{htoc}{20}{0}{\@print{A}\quad{}Grammar for Parsers} +\@addtocsec{htoc}{21}{0}{\@print{B}\quad{}Upgrading} +\@addtocsec{htoc}{22}{0}{\@print{C}\quad{}Troubleshooting} +\@addtocsec{htoc}{23}{0}{\@print{D}\quad{}History} +\@addtocsec{htoc}{24}{0}{\@print{E}\quad{}Debian Extensions} +\newlabel{sec:debian}{{E}{X}} +\@addtocsec{htoc}{25}{0}{\@print{F}\quad{}Future Extensions} +\newlabel{sec:future}{{F}{X}} +\@addtocsec{htoc}{26}{0}{\@print{G}\quad{}References} diff --git a/doc/yapps2.html b/doc/yapps2.html new file mode 100644 index 0000000..f95be4d --- /dev/null +++ b/doc/yapps2.html 
@@ -0,0 +1,1206 @@ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" + "http://www.w3.org/TR/REC-html40/loose.dtd"> +<HTML> + +<HEAD> + + +<META http-equiv="Content-Type" content="text/html; charset=UTF-8"> +<META name="GENERATOR" content="hevea 1.08"> +<STYLE type="text/css"> +.toc{list-style:none;} +.title{margin:auto;text-align:center} +.center{text-align:center;margin-left:auto;margin-right:auto;} +.flushleft{text-align:left;margin-left:0ex;margin-right:auto;} +.flushright{text-align:right;margin-left:auto;margin-right:0ex;} +DIV TABLE{margin-left:inherit;margin-right:inherit;} +PRE{text-align:left;margin-left:0ex;margin-right:auto;} +BLOCKQUOTE{margin-left:4ex;margin-right:4ex;text-align:left;} +.part{margin:auto;text-align:center} +</STYLE> +</HEAD> + +<BODY > +<!--HEVEA command line is: /usr/bin/hevea yapps2.tex --> +<!--HTMLHEAD--> +<!--ENDHTML--> +<!--PREFIX <ARG ></ARG>--> +<!--CUT DEF section 1 --> + +<BR> +<BR> +<DIV CLASS="center"> +<TABLE CELLSPACING=2 CELLPADDING=0> +<TR><TD ALIGN=center NOWRAP><FONT SIZE=5>The <EM>Yapps</EM> Parser Generator System</FONT></TD> +</TR> +<TR><TD ALIGN=center NOWRAP><CODE>http://theory.stanford.edu/~amitp/Yapps/</CODE></TD> +</TR> +<TR><TD ALIGN=center NOWRAP>Version 2</TD> +</TR> +<TR><TD ALIGN=center NOWRAP> </TD> +</TR> +<TR><TD ALIGN=center NOWRAP>Amit J. Patel</TD> +</TR> +<TR><TD ALIGN=center NOWRAP><CODE>http://www-cs-students.stanford.edu/~amitp/</CODE></TD> +</TR></TABLE> <HR SIZE=2> +</DIV><BR> +<BR> +<!--TOC section Introduction--> + +<H2 CLASS="section"><A NAME="htoc1">1</A> Introduction</H2><!--SEC END --> + +<EM>Yapps</EM> (<U>Y</U>et <U>A</U>nother <U>P</U>ython +<U>P</U>arser <U>S</U>ystem) is an easy to use parser +generator that is written in Python and generates Python code. There +are several parser generator systems already available for Python, +including <TT>PyLR, kjParsing, PyBison,</TT> and <TT>mcf.pars,</TT> +but I had different goals for my parser. Yapps is simple, is easy to +use, and produces human-readable parsers. It is not the fastest or +most powerful parser. Yapps is designed to be used when regular +expressions are not enough and other parser systems are too much: +situations where you may write your own recursive descent parser.<BR> +<BR> +Some unusual features of Yapps that may be of interest are: +<OL CLASS="enumerate" type=1><LI CLASS="li-enumerate">Yapps produces recursive descent parsers that are readable by + humans, as opposed to table-driven parsers that are difficult to + read. A Yapps parser for a simple calculator looks similar to the + one that Mark Lutz wrote by hand for <EM>Programming Python.</EM><BR> +<BR> +<LI CLASS="li-enumerate">Yapps also allows for rules that accept parameters and pass + arguments to be used while parsing subexpressions. Grammars that + allow for arguments to be passed to subrules and for values to be + passed back are often called <EM>attribute grammars.</EM> In many + cases parameterized rules can be used to perform actions at “parse + time” that are usually delayed until later. For example, + information about variable declarations can be passed into the + rules that parse a procedure body, so that undefined variables can + be detected at parse time. 
The types of defined variables can be + used in parsing as well—for example, if the type of <TT>X</TT> is + known, we can determine whether <TT>X(1)</TT> is an array reference or + a function call.<BR> +<BR> +<LI CLASS="li-enumerate">Yapps grammars are fairly easy to write, although there are + some inconveniences having to do with ELL(1) parsing that have to be + worked around. For example, rules have to be left factored and + rules may not be left recursive. However, neither limitation seems + to be a problem in practice. <BR> +<BR> +Yapps grammars look similar to the notation used in the Python + reference manual, with operators like <CODE>*</CODE>, <CODE>+</CODE>, <CODE>|</CODE>, + <CODE>[]</CODE>, and <CODE>()</CODE> for patterns, names (<TT>tim</TT>) for rules, + regular expressions (<CODE>"[a-z]+"</CODE>) for tokens, and <CODE>#</CODE> for + comments.<BR> +<BR> +<LI CLASS="li-enumerate">The Yapps parser generator is written as a single Python module + with no C extensions. Yapps produces parsers that are written + entirely in Python, and require only the Yapps run-time module (5k) + for support.<BR> +<BR> +<LI CLASS="li-enumerate">Yapps's scanner is context-sensitive, picking tokens based on + the types of the tokens accepted by the parser. This can be + helpful when implementing certain kinds of parsers, such as for a + preprocessor.</OL> +There are several disadvantages of using Yapps over another parser system: +<OL CLASS="enumerate" type=1><LI CLASS="li-enumerate">Yapps parsers are <TT>ELL(1)</TT> (Extended LL(1)), which is + less powerful than <TT>LALR</TT> (used by <TT>PyLR</TT>) or + <TT>SLR</TT> (used by <TT>kjParsing</TT>), so Yapps would not be a + good choice for parsing complex languages. For example, allowing + both <TT>x := 5;</TT> and <TT>x;</TT> as statements is difficult + because we must distinguish based on only one token of lookahead. + Seeing only <TT>x</TT>, we cannot decide whether we have an + assignment statement or an expression statement. (Note however + that this kind of grammar can be matched with backtracking; see + section <A HREF="#sec:future">F</A>.)<BR> +<BR> +<LI CLASS="li-enumerate">The scanner that Yapps provides can only read from strings, not + files, so an entire file has to be read in before scanning can + begin. It is possible to build a custom scanner, though, so in + cases where stream input is needed (from the console, a network, or + a large file are examples), the Yapps parser can be given a custom + scanner that reads from a stream instead of a string.<BR> +<BR> +<LI CLASS="li-enumerate">Yapps is not designed with efficiency in mind.</OL> +Yapps provides an easy to use parser generator that produces parsers +similar to what you might write by hand. It is not meant to be a +solution for all parsing problems, but instead an aid for those times +you would write a parser by hand rather than using one of the more +powerful parsing packages available.<BR> +<BR> +Yapps 2.0 is easier to use than Yapps 1.0. New features include a +less restrictive input syntax, which allows mixing of sequences, +choices, terminals, and nonterminals; optional matching; the ability +to insert single-line statements into the generated parser; and +looping constructs <CODE>*</CODE> and <CODE>+</CODE> similar to the repetitive +matching constructs in regular expressions. Unfortunately, the +addition of these constructs has made Yapps 2.0 incompatible with +Yapps 1.0, so grammars will have to be rewritten. 
See section +<A HREF="#sec:Upgrading">B</A> for tips on changing Yapps 1.0 grammars for use +with Yapps 2.0.<BR> +<BR> +<!--TOC section Examples--> + +<H2 CLASS="section"><A NAME="htoc2">2</A> Examples</H2><!--SEC END --> + +In this section are several examples that show the use of Yapps. +First, an introduction shows how to construct grammars and write them +in Yapps form. This example can be skipped by someone familiar with +grammars and parsing. Next is a Lisp expression grammar that produces +a parse tree as output. This example demonstrates the use of tokens +and rules, as well as returning values from rules. The third example +is an expression evaluation grammar that evaluates during parsing +(instead of producing a parse tree).<BR> +<BR> +<!--TOC subsection Introduction to Grammars--> + +<H3 CLASS="subsection"><A NAME="htoc3">2.1</A> Introduction to Grammars</H3><!--SEC END --> + +A <EM>grammar</EM> for a natural language specifies how words can be put +together to form large structures, such as phrases and sentences. A +grammar for a computer language is similar in that it specifies how +small components (called <EM>tokens</EM>) can be put together to form +larger structures. In this section we will write a grammar for a tiny +subset of English.<BR> +<BR> +Simple English sentences can be described as being a noun phrase +followed by a verb followed by a noun phrase. For example, in the +sentence, “Jack sank the blue ship,” the word “Jack” is the first +noun phrase, “sank” is the verb, and “the blue ship” is the second +noun phrase. In addition we should say what a noun phrase is; for +this example we shall say that a noun phrase is an optional article +(a, an, the) followed by any number of adjectives followed by a noun. +The tokens in our language are the articles, nouns, verbs, and +adjectives. The <EM>rules</EM> in our language will tell us how to +combine the tokens together to form lists of adjectives, noun phrases, +and sentences: +<UL CLASS="itemize"><LI CLASS="li-itemize"> + <TT>sentence: noun_phrase verb noun_phrase</TT> + <LI CLASS="li-itemize"><TT>noun_phrase: [article] adjective* noun</TT> +</UL> +Notice that some things that we said easily in English, such as +“optional article,” are expressed using special syntax, such as +brackets. When we said, “any number of adjectives,” we wrote +<TT>adjective*</TT>, where the <TT>*</TT> means “zero or more of the +preceding pattern”.<BR> +<BR> +The grammar given above is close to a Yapps grammar. We also have to +specify what the tokens are, and what to do when a pattern is matched. +For this example, we will do nothing when patterns are matched; the +next example will explain how to perform match actions. +<PRE CLASS="verbatim"> +parser TinyEnglish: + ignore: "\\W+" + token noun: "(Jack|spam|ship)" + token verb: "(sank|threw)" + token article: "(a|an|the)" + token adjective: "(blue|red|green)" + + rule sentence: noun_phrase verb noun_phrase + rule noun_phrase: [article] adjective* noun +</PRE> +The tokens are specified as Python <EM>regular expressions</EM>. Since +Yapps produces Python code, you can write any regular expression that +would be accepted by Python. (<EM>Note:</EM> These are Python 1.5 +regular expressions from the <TT>re</TT> module, not Python 1.4 +regular expressions from the <TT>regex</TT> module.) In addition to +tokens that you want to see (which are given names), you can also +specify tokens to ignore, marked by the <TT>ignore</TT> keyword. 
In +this parser we want to ignore whitespace.<BR> +<BR> +The TinyEnglish grammar shows how you define tokens and rules, but it +does not specify what should happen once we've matched the rules. In +the next example, we will take a grammar and produce a <EM>parse +tree</EM> from it.<BR> +<BR> +<!--TOC subsection Lisp Expressions--> + +<H3 CLASS="subsection"><A NAME="htoc4">2.2</A> Lisp Expressions</H3><!--SEC END --> + +Lisp syntax, although hated by many, has a redeeming quality: it is +simple to parse. In this section we will construct a Yapps grammar to +parse Lisp expressions and produce a parse tree as output.<BR> +<BR> +<!--TOC subsubsection Defining the Grammar--> + +<H4 CLASS="subsubsection">Defining the Grammar</H4><!--SEC END --> + +The syntax of Lisp is simple. It has expressions, which are +identifiers, strings, numbers, and lists. A list is a left +parenthesis followed by some number of expressions (separated by +spaces) followed by a right parenthesis. For example, <CODE>5</CODE>, +<CODE>"ni"</CODE>, and <CODE>(print "1+2 = " (+ 1 2))</CODE> are Lisp expressions. +Written as a grammar, +<PRE CLASS="verbatim"> + expr: ID | STR | NUM | list + list: ( expr* ) +</PRE> +In addition to having a grammar, we need to specify what to do every +time something is matched. For the tokens, which are strings, we just +want to get the “value” of the token, attach its type (identifier, +string, or number) in some way, and return it. For the lists, we want +to construct and return a Python list.<BR> +<BR> +Once some pattern is matched, we add a return statement enclosed +in <CODE>{{...}}</CODE>. The braces allow us to insert any one-line +statement into the parser. Within this statement, we can refer to the +values returned by matching each part of the rule. After matching a +token such as <TT>ID</TT>, “ID” will be bound to the text of the +matched token. Let's take a look at the rule: +<PRE CLASS="verbatim"> + rule expr: ID {{ return ('id', ID) }} + ... +</PRE> +In a rule, tokens return the text that was matched. For identifiers, +we just return the identifier, along with a “tag” telling us that +this is an identifier and not a string or some other value. Sometimes +we may need to convert this text to a different form. For example, if +a string is matched, we want to remove quotes and handle special forms +like <CODE>\n</CODE>. If a number is matched, we want to convert it into a +number. Let's look at the return values for the other tokens: +<PRE CLASS="verbatim"> + ... + | STR {{ return ('str', eval(STR)) }} + | NUM {{ return ('num', atoi(NUM)) }} + ... +</PRE> +If we get a string, we want to remove the quotes and process any +special backslash codes, so we run <TT>eval</TT> on the quoted string. +If we get a number, we convert it to an integer with <TT>atoi</TT> and +then return the number along with its type tag.<BR> +<BR> +For matching a list, we need to do something slightly more +complicated. If we match a Lisp list of expressions, we want to +create a Python list with those values. +<PRE CLASS="verbatim"> + rule list: "\\(" # Match the opening parenthesis + {{ result = [] }} # Create a Python list + ( + expr # When we match an expression, + {{ result.append(expr) }} # add it to the list + )* # * means repeat this if needed + "\\)" # Match the closing parenthesis + {{ return result }} # Return the Python list +</PRE> +In this rule we first match the opening parenthesis, then go into a +loop. In this loop we match expressions and add them to the list. 
+When there are no more expressions to match, we match the closing +parenthesis and return the resulting list. Note that <CODE>#</CODE> is used for +comments, just as in Python.<BR> +<BR> +The complete grammar is specified as follows: +<PRE CLASS="verbatim"> +parser Lisp: + ignore: '\\s+' + token NUM: '[0-9]+' + token ID: '[-+*/!@%^&=.a-zA-Z0-9_]+' + token STR: '"([^\\"]+|\\\\.)*"' + + rule expr: ID {{ return ('id', ID) }} + | STR {{ return ('str', eval(STR)) }} + | NUM {{ return ('num', atoi(NUM)) }} + | list {{ return list }} + rule list: "\\(" {{ result = [] }} + ( expr {{ result.append(expr) }} + )* + "\\)" {{ return result }} +</PRE> +One thing you may have noticed is that <CODE>"\\("</CODE> and <CODE>"\\)"</CODE> +appear in the <TT>list</TT> rule. These are <EM>inline tokens</EM>: +they appear in the rules without being given a name with the +<TT>token</TT> keyword. Inline tokens are more convenient to use, but +since they do not have a name, the text that is matched cannot be used +in the return value. They are best used for short simple patterns +(usually punctuation or keywords).<BR> +<BR> +Another thing to notice is that the number and identifier tokens +overlap. For example, “487” matches both NUM and ID. In Yapps, the +scanner only tries to match tokens that are acceptable to the parser. +This rule doesn't help here, since both NUM and ID can appear in the +same place in the grammar. There are two rules used to pick tokens if +more than one matches. One is that the <EM>longest</EM> match is +preferred. For example, “487x” will match as an ID (487x) rather +than as a NUM (487) followed by an ID (x). The second rule is that if +the two matches are the same length, the <EM>first</EM> one listed in +the grammar is preferred. For example, “487” will match as a NUM +rather than as an ID because NUM is listed first in the grammar. Inline +tokens have preference over any tokens you have listed.<BR> +<BR> +Now that our grammar is defined, we can run Yapps to produce a parser, +and then run the parser to produce a parse tree.<BR> +<BR> +<!--TOC subsubsection Running Yapps--> + +<H4 CLASS="subsubsection">Running Yapps</H4><!--SEC END --> + +In the Yapps module is a function <TT>generate</TT> that takes an +input filename and writes a parser to another file. We can use this +function to generate the Lisp parser, which is assumed to be in +<TT>lisp.g</TT>. +<PRE CLASS="verbatim"> +% python +Python 1.5.1 (#1, Sep 3 1998, 22:51:17) [GCC 2.7.2.3] on linux-i386 +Copyright 1991-1995 Stichting Mathematisch Centrum, Amsterdam +>>> import yapps +>>> yapps.generate('lisp.g') +</PRE> +At this point, Yapps has written a file <TT>lisp.py</TT> that contains +the parser. In that file are two classes (one scanner and one parser) +and a function (called <TT>parse</TT>) that puts things together for +you.<BR> +<BR> +Alternatively, we can run Yapps from the command line to generate the +parser file: +<PRE CLASS="verbatim"> +% python yapps.py lisp.g +</PRE> +After running Yapps either from within Python or from the command +line, we can use the Lisp parser by calling the <TT>parse</TT> +function. The first parameter should be the rule we want to match, +and the second parameter should be the string to parse. 
+<PRE CLASS="verbatim"> +>>> import lisp +>>> lisp.parse('expr', '(+ 3 4)') +[('id', '+'), ('num', 3), ('num', 4)] +>>> lisp.parse('expr', '(print "3 = " (+ 1 2))') +[('id', 'print'), ('str', '3 = '), [('id', '+'), ('num', 1), ('num', 2)]] +</PRE> +The <TT>parse</TT> function is not the only way to use the parser; +section <A HREF="#sec:Parser-Objects">5.1</A> describes how to access parser objects +directly.<BR> +<BR> +We've now gone through the steps in creating a grammar, writing a +grammar file for Yapps, producing a parser, and using the parser. In +the next example we'll see how rules can take parameters and also how +to do computations instead of just returning a parse tree.<BR> +<BR> +<!--TOC subsection Calculator--> + +<H3 CLASS="subsection"><A NAME="htoc5">2.3</A> Calculator</H3><!--SEC END --> + +A common example parser given in many textbooks is that for simple +expressions, with numbers, addition, subtraction, multiplication, +division, and parenthesization of subexpressions. We'll write this +example in Yapps, evaluating the expression as we parse.<BR> +<BR> +Unlike <TT>yacc</TT>, Yapps does not have any way to specify +precedence rules, so we have to do it ourselves. We say that an +expression is the sum of terms, and that a term is the product of +factors, and that a factor is a number or a parenthesized expression: +<PRE CLASS="verbatim"> + expr: factor ( ("+"|"-") factor )* + factor: term ( ("*"|"/") term )* + term: NUM | "(" expr ")" +</PRE> +In order to evaluate the expression as we go, we should keep along an +accumulator while evaluating the lists of terms or factors. Just as +we kept a “result” variable to build a parse tree for Lisp +expressions, we will use a variable to evaluate numerical +expressions. The full grammar is given below: +<PRE CLASS="verbatim"> +parser Calculator: + token END: "$" # $ means end of string + token NUM: "[0-9]+" + + rule goal: expr END {{ return expr }} + + # An expression is the sum and difference of factors + rule expr: factor {{ v = factor }} + ( "[+]" factor {{ v = v+factor }} + | "-" factor {{ v = v-factor }} + )* {{ return v }} + + # A factor is the product and division of terms + rule factor: term {{ v = term }} + ( "[*]" term {{ v = v*term }} + | "/" term {{ v = v/term }} + )* {{ return v }} + + # A term is either a number or an expression surrounded by parentheses + rule term: NUM {{ return atoi(NUM) }} + | "\\(" expr "\\)" {{ return expr }} +</PRE> +The top-level rule is <EM>goal</EM>, which says that we are looking for +an expression followed by the end of the string. The <TT>END</TT> +token is needed because without it, it isn't clear when to stop +parsing. For example, the string “1+3” could be parsed either as +the expression “1” followed by the string “+3” or it could be +parsed as the expression “1+3”. By requiring expressions to end +with <TT>END</TT>, the parser is forced to take “1+3”.<BR> +<BR> +In the two rules with repetition, the accumulator is named <TT>v</TT>. +After reading in one expression, we initialize the accumulator. Each +time through the loop, we modify the accumulator by adding, +subtracting, multiplying by, or dividing the previous accumulator by +the expression that has been parsed. 
At the end of the rule, we +return the accumulator.<BR> +<BR> +The calculator example shows how to process lists of elements using +loops, as well as how to handle precedence of operators.<BR> +<BR> +<EM>Note:</EM> It's often important to put the <TT>END</TT> token in, so +put it in unless you are sure that your grammar has some other +non-ambiguous token marking the end of the program.<BR> +<BR> +<!--TOC subsection Calculator with Memory--> + +<H3 CLASS="subsection"><A NAME="htoc6">2.4</A> Calculator with Memory</H3><!--SEC END --> + +In the previous example we learned how to write a calculator that +evaluates simple numerical expressions. In this section we will +extend the example to support both local and global variables.<BR> +<BR> +To support global variables, we will add assignment statements to the +“goal” rule. +<PRE CLASS="verbatim"> + rule goal: expr END {{ return expr }} + | 'set' ID expr END {{ global_vars[ID] = expr }} + {{ return expr }} +</PRE> +To use these variables, we need a new kind of terminal: +<PRE CLASS="verbatim"> + rule term: ... | ID {{ return global_vars[ID] }} +</PRE> +So far, these changes are straightforward. We simply have a global +dictionary <TT>global_vars</TT> that stores the variables and values, +we modify it when there is an assignment statement, and we look up +variables in it when we see a variable name.<BR> +<BR> +To support local variables, we will add variable declarations to the +set of allowed expressions. +<PRE CLASS="verbatim"> + rule term: ... | 'let' VAR '=' expr 'in' expr ... +</PRE> +This is where it becomes tricky. Local variables should be stored in +a local dictionary, not in the global one. One trick would be to save +a copy of the global dictionary, modify it, and then restore it +later. In this example we will instead use <EM>attributes</EM> to +create local information and pass it to subrules.<BR> +<BR> +A rule can optionally take parameters. When we invoke the rule, we +must pass in arguments. For local variables, let's use a single +parameter, <TT>local_vars</TT>: +<PRE CLASS="verbatim"> + rule expr<<local_vars>>: ... + rule factor<<local_vars>>: ... + rule term<<local_vars>>: ... +</PRE> +Each time we want to match <TT>expr</TT>, <TT>factor</TT>, or +<TT>term</TT>, we will pass the local variables in the current rule to +the subrule. One interesting case is when we pass as an argument +something <EM>other</EM> than <TT>local_vars</TT>: +<PRE CLASS="verbatim"> + rule term<<local_vars>>: ... + | 'let' VAR '=' expr<<local_vars>> + {{ local_vars = [(VAR, expr)] + local_vars }} + 'in' expr<<local_vars>> + {{ return expr }} +</PRE> +Note that the assignment to the local variables list does not modify +the original list. This is important to keep local variables from +being seen outside the “let”.<BR> +<BR> +The other interesting case is when we find a variable: +<PRE CLASS="verbatim"> +global_vars = {} + +def lookup(map, name): + for x,v in map: + if x==name: return v + return global_vars[name] +%% + ... + rule term<<local_vars>>: ... + | VAR {{ return lookup(local_vars, VAR) }} +</PRE> +The lookup function will search through the local variable list, and +if it cannot find the name there, it will look it up in the global +variable dictionary.<BR> +<BR> +A complete grammar for this example, including a read-eval-print loop +for interacting with the calculator, can be found in the examples +subdirectory included with Yapps.<BR> +<BR> +In this section we saw how to insert code before the parser. 
We also +saw how to use attributes to transmit local information from one rule +to its subrules.<BR> +<BR> +<!--TOC section Grammars--> + +<H2 CLASS="section"><A NAME="htoc7">3</A> Grammars</H2><!--SEC END --> + +Each Yapps grammar has a name, a list of tokens, and a set of +production rules. A grammar named <TT>X</TT> will be used to produce +a parser named <TT>X</TT> and a scanner named <TT>XScanner</TT>. As +in Python, names are case sensitive, start with a letter, and contain +letters, numbers, and underscores (_).<BR> +<BR> +There are three kinds of tokens in Yapps: named, inline, and ignored. +As their name implies, named tokens are given a name, using the token +construct: <TT>token <EM>name</EM> : <EM>regexp</EM></TT>. In a rule, the +token can be matched by using the name. Inline tokens are regular +expressions that are used in rules without being declared. Ignored +tokens are declared using the ignore construct: <TT>ignore: + <EM>regexp</EM></TT>. These tokens are ignored by the scanner, and are +not seen by the parser. Often whitespace is an ignored token. The +regular expressions used to define tokens should use the syntax +defined in the <TT>re</TT> module, so some symbols may have to be +backslashed.<BR> +<BR> +Production rules in Yapps have a name and a pattern to match. If the +rule is parameterized, the name should be followed by a list of +parameter names in <CODE><<...>></CODE>. A pattern can be a simple pattern +or a compound pattern. Simple patterns are the name of a named token, +a regular expression in quotes (inline token), the name of a +production rule (followed by arguments in <CODE><<...>></CODE>, if the rule +has parameters), and single line Python statements (<CODE>{{...}}</CODE>). +Compound patterns are sequences (<CODE>A B C ...</CODE>), choices ( +<CODE>A | B | C | ...</CODE>), options (<CODE>[...]</CODE>), zero-or-more repetitions +(<CODE>...*</CODE>), and one-or-more repetitions (<CODE>...+</CODE>). Like +regular expressions, repetition operators have a higher precedence +than sequences, and sequences have a higher precedence than choices.<BR> +<BR> +Whenever <CODE>{{...}}</CODE> is used, a legal one-line Python statement +should be put inside the braces. The token <CODE>}}</CODE> should not +appear within the <CODE>{{...}}</CODE> section, even within a string, since +Yapps does not attempt to parse the Python statement. A workaround +for strings is to put two strings together (<CODE>"}" "}"</CODE>), or to use +backslashes (<CODE>"}\}"</CODE>). At the end of a rule you should use a +<CODE>{{ return X }}</CODE> statement to return a value. However, you +should <EM>not</EM> use any control statements (<TT>return</TT>, +<TT>continue</TT>, <TT>break</TT>) in the middle of a rule. Yapps +needs to make assumptions about the control flow to generate a parser, +and any changes to the control flow will confuse Yapps.<BR> +<BR> +The <CODE><<...>></CODE> form can occur in two places: to define parameters +to a rule and to give arguments when matching a rule. Parameters use +the syntax used for Python functions, so they can include default +arguments and the special forms (<CODE>*args</CODE> and <CODE>**kwargs</CODE>). +Arguments use the syntax for Python function call arguments, so they +can include normal arguments and keyword arguments. The token +<CODE>>></CODE> should not appear within the <CODE><<...>></CODE> section.<BR> +<BR> +In both the statements and rule arguments, you can use names defined +by the parser to refer to matched patterns. 
You can refer to the text +matched by a named token by using the token name. You can use the +value returned by a production rule by using the name of that rule. +If a name <TT>X</TT> is matched more than once (such as in loops), you +will have to save the earlier value(s) in a temporary variable, and +then use that temporary variable in the return value. The next +section has an example of a name that occurs more than once.<BR> +<BR> +<!--TOC subsection Left Factoring--> + +<H3 CLASS="subsection"><A NAME="htoc8">3.1</A> Left Factoring</H3><!--SEC END --> + +<A NAME="sec:Left-Factoring"></A> +Yapps produces ELL(1) parsers, which determine which clause to match +based on the first token available. Sometimes the leftmost tokens of +several clauses may be the same. The classic example is the +<EM>if/then/else</EM> construct in Pascal: +<PRE CLASS="verbatim"> +rule stmt: "if" expr "then" stmt {{ then_part = stmt }} + "else" stmt {{ return ('If',expr,then_part,stmt) }} + | "if" expr "then" stmt {{ return ('If',expr,stmt,[]) }} +</PRE> +(Note that we have to save the first <TT>stmt</TT> into a variable +because there is another <TT>stmt</TT> that will be matched.) The +left portions of the two clauses are the same, which presents a +problem for the parser. The solution is <EM>left-factoring</EM>: the +common parts are put together, and <EM>then</EM> a choice is made about +the remaining part: +<PRE CLASS="verbatim"> +rule stmt: "if" expr + "then" stmt {{ then_part = stmt }} + {{ else_part = [] }} + [ "else" stmt {{ else_part = stmt }} ] + {{ return ('If', expr, then_part, else_part) }} +</PRE> +Unfortunately, the classic <EM>if/then/else</EM> situation is +<EM>still</EM> ambiguous when you left-factor. Yapps can deal with this +situation, but will report a warning; see section +<A HREF="#sec:Ambiguous-Grammars">3.3</A> for details.<BR> +<BR> +In general, replace rules of the form: +<PRE CLASS="verbatim"> +rule A: a b1 {{ return E1 }} + | a b2 {{ return E2 }} + | c3 {{ return E3 }} + | c4 {{ return E4 }} +</PRE> +with rules of the form: +<PRE CLASS="verbatim"> +rule A: a ( b1 {{ return E1 }} + | b2 {{ return E2 }} + ) + | c3 {{ return E3 }} + | c4 {{ return E4 }} +</PRE> +<!--TOC subsection Left Recursion--> + +<H3 CLASS="subsection"><A NAME="htoc9">3.2</A> Left Recursion</H3><!--SEC END --> + +A common construct in grammars is for matching a list of patterns, +sometimes separated with delimiters such as commas or semicolons. In +LR-based parser systems, we can parse a list with something like this: +<PRE CLASS="verbatim"> +rule sum: NUM {{ return NUM }} + | sum "+" NUM {{ return (sum, NUM) }} +</PRE> +Parsing <TT>1+2+3+4</TT> would produce the output +<TT>(((1,2),3),4)</TT>, which is what we want from a left-associative +addition operator. Unfortunately, this grammar is <EM>left +recursive,</EM> because the <TT>sum</TT> rule contains a clause that +begins with <TT>sum</TT>. (The recursion occurs at the left side of +the clause.)<BR> +<BR> +We must restructure this grammar to be <EM>right recursive</EM> instead: +<PRE CLASS="verbatim"> +rule sum: NUM {{ return NUM }} + | NUM "+" sum {{ return (NUM, sum) }} +</PRE> +Unfortunately, using this grammar, <TT>1+2+3+4</TT> would be parsed as +<TT>(1,(2,(3,4)))</TT>, which no longer follows left associativity. +The rule also needs to be left-factored. 
Instead, we write the
+pattern as a loop:
+<PRE CLASS="verbatim">
+rule sum: NUM {{ v = NUM }}
+          ( "[+]" NUM {{ v = (v,NUM) }} )*
+          {{ return v }}
+</PRE>
+In general, replace rules of the form:
+<PRE CLASS="verbatim">
+rule A: A a1 -> << E1 >>
+      | A a2 -> << E2 >>
+      | b3   -> << E3 >>
+      | b4   -> << E4 >>
+</PRE>
+with rules of the form:
+<PRE CLASS="verbatim">
+rule A: ( b3 {{ A = E3 }}
+        | b4 {{ A = E4 }} )
+        ( a1 {{ A = E1 }}
+        | a2 {{ A = E2 }} )*
+        {{ return A }}
+</PRE>
+We have taken a rule that proved problematic with recursion and
+turned it into a rule that works well with looping constructs.<BR>
+<BR>
+<!--TOC subsection Ambiguous Grammars-->
+
+<H3 CLASS="subsection"><A NAME="htoc10">3.3</A> Ambiguous Grammars</H3><!--SEC END -->
+
+<A NAME="sec:Ambiguous-Grammars"></A>
+In section <A HREF="#sec:Left-Factoring">3.1</A> we saw the classic if/then/else
+ambiguity, which occurs because the “else ...” portion of an “if
+...then ...else ...” construct is optional. Programs with
+nested if/then/else constructs can be ambiguous when one of the else
+clauses is missing:
+<PRE CLASS="verbatim">
+if 1 then                  if 1 then
+    if 5 then                  if 5 then
+        x := 1;                    x := 1;
+    else                   else
+        y := 9;                y := 9;
+</PRE>
+The indentation shows that the program can be parsed in two different
+ways. (Of course, if we all would adopt Python's indentation-based
+structuring, this would never happen!) Usually we want the parsing on
+the left: the “else” should be associated with the closest “if”
+statement. In section <A HREF="#sec:Left-Factoring">3.1</A> we “solved” the
+problem by using the following grammar:
+<PRE CLASS="verbatim">
+rule stmt: "if" expr
+           "then" stmt {{ then_part = stmt }}
+           {{ else_part = [] }}
+           [ "else" stmt {{ else_part = stmt }} ]
+           {{ return ('If', expr, then_part, else_part) }}
+</PRE>
+Here, we have an optional match of “else” followed by a statement.
+The ambiguity is that if an “else” is present, it is not clear
+whether you want it parsed immediately or if you want it to be parsed
+by the outer “if”.<BR>
+<BR>
+Yapps will deal with the situation by matching the else pattern
+when it can. The parser will work in this case because it prefers the
+<EM>first</EM> matching clause, which tells Yapps to parse the “else”.
+That is exactly what we want!<BR>
+<BR>
+For ambiguity cases with choices, Yapps will choose the <EM>first</EM>
+matching choice. However, remember that Yapps only looks at the first
+token to determine its decision, so <TT>(a b | a c)</TT> will result in
+Yapps choosing <TT>a b</TT> even when the input is <TT>a c</TT>. It only
+looks at the first token, <TT>a</TT>, to make its decision.<BR>
+<BR>
+<!--TOC section Customization-->
+
+<H2 CLASS="section"><A NAME="htoc11">4</A> Customization</H2><!--SEC END -->
+
+Both the parsers and the scanners can be customized. The parser is
+usually extended by subclassing, and the scanner can either be
+subclassed or completely replaced.<BR>
+<BR>
+<!--TOC subsection Customizing Parsers-->
+
+<H3 CLASS="subsection"><A NAME="htoc12">4.1</A> Customizing Parsers</H3><!--SEC END -->
+
+If additional fields and methods are needed in order for a parser to
+work, Python subclassing can be used. (This is unlike parser classes
+written in static languages, in which these fields and methods must be
+defined in the generated parser class.) We simply subclass the
+generated parser, and add any fields or methods required. 
Expressions
+in the grammar can call methods of the subclass to perform any actions
+that cannot be expressed as a simple expression. For example,
+consider this simple grammar:
+<PRE CLASS="verbatim">
+parser X:
+    rule goal: "something" {{ self.printmsg() }}
+</PRE>
+The <TT>printmsg</TT> function need not be implemented in the parser
+class <TT>X</TT>; it can be implemented in a subclass:
+<PRE CLASS="verbatim">
+import Xparser
+
+class MyX(Xparser.X):
+    def printmsg(self):
+        print "Hello!"
+</PRE>
+<!--TOC subsection Customizing Scanners-->
+
+<H3 CLASS="subsection"><A NAME="htoc13">4.2</A> Customizing Scanners</H3><!--SEC END -->
+
+The generated parser class is not dependent on the generated scanner
+class. A scanner object is passed to the parser object's constructor
+in the <TT>parse</TT> function. To use a different scanner, write
+your own function to construct parser objects, with an instance of a
+different scanner. Scanner objects must have a <TT>token</TT> method
+that accepts an integer <TT>N</TT> as well as a list of allowed token
+types, and returns the Nth token, as a tuple. The default scanner
+raises <TT>NoMoreTokens</TT> if no tokens are available, and
+<TT>SyntaxError</TT> if no token could be matched. However, the
+parser does not rely on these exceptions; only the <TT>parse</TT>
+convenience function (which calls <TT>wrap_error_reporter</TT>) and
+the <TT>print_error</TT> error display function use those exceptions.<BR>
+<BR>
+The tuples representing tokens have four elements. The first two are
+the beginning and ending indices of the matched text in the input
+string. The third element is the type tag, matching either the name
+of a named token or the quoted regexp of an inline or ignored token.
+The fourth element of the token tuple is the matched text. If the
+input string is <TT>s</TT>, and the token tuple is
+<TT>(b,e,type,val)</TT>, then <TT>val</TT> should be equal to
+<TT>s[b:e]</TT>.<BR>
+<BR>
+The generated parsers do not use the beginning or ending index. They
+use only the token type and value. However, the default error reporter
+uses the beginning and ending index to show the user where the error
+is.<BR>
+<BR>
+<!--TOC section Parser Mechanics-->
+
+<H2 CLASS="section"><A NAME="htoc14">5</A> Parser Mechanics</H2><!--SEC END -->
+
+The base parser class (Parser) defines two methods, <TT>_scan</TT>
+and <TT>_peek</TT>, and two fields, <TT>_pos</TT> and
+<TT>_scanner</TT>. The generated parser inherits from the base
+parser, and contains one method for each rule in the grammar. To
+avoid name clashes, do not use names that begin with an underscore
+(<TT>_</TT>).<BR>
+<BR>
+<!--TOC subsection Parser Objects-->
+
+<H3 CLASS="subsection"><A NAME="htoc15">5.1</A> Parser Objects</H3><!--SEC END -->
+
+<A NAME="sec:Parser-Objects"></A>
+Yapps produces as output two exception classes, a scanner class, a
+parser class, and a function <TT>parse</TT> that puts everything
+together. The <TT>parse</TT> function does not have to be used;
+instead, one can create a parser and scanner object and use them
+together for parsing.
+<PRE CLASS="verbatim">
+    def parse(rule, text):
+        P = X(XScanner(text))
+        return wrap_error_reporter(P, rule)
+</PRE>
+The <TT>parse</TT> function takes a name of a rule and an input string
+as input. It creates a scanner and parser object, then calls
+<TT>wrap_error_reporter</TT> to execute the method in the parser
+object named <TT>rule</TT>. The wrapper function will call the
+appropriate parser rule and report any parsing errors to standard
+output.<BR>
+<BR>
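+If the error-reporting wrapper is not wanted, the parser object can
+be driven directly; each rule becomes a method of the parser, and the
+scanner exceptions simply propagate to the caller. A minimal sketch,
+reusing the generated names <TT>X</TT> and <TT>XScanner</TT> from the
+definition above and a hypothetical rule named <TT>goal</TT>:
+<PRE CLASS="verbatim">
+P = X(XScanner(text))   # text is the input string to be parsed
+result = P.goal()       # call the rule method directly;
+                        # SyntaxError/NoMoreTokens are not caught here
+</PRE>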
+There are several situations in which the <TT>parse</TT> function
+would not be useful. If a different parser or scanner is being used,
+or exceptions are to be handled differently, a new <TT>parse</TT>
+function would be required. The supplied <TT>parse</TT> function can
+be used as a template for writing a function for your own needs. An
+example of a custom parse function is the <TT>generate</TT> function
+in <TT>Yapps.py</TT>.<BR>
+<BR>
+<!--TOC subsection Context Sensitive Scanner-->
+
+<H3 CLASS="subsection"><A NAME="htoc16">5.2</A> Context Sensitive Scanner</H3><!--SEC END -->
+
+Unlike most scanners, the scanner produced by Yapps can take into
+account the context in which tokens are needed, and try to match only
+good tokens. For example, in the grammar:
+<PRE CLASS="verbatim">
+parser IniFile:
+    token ID:  "[a-zA-Z_0-9]+"
+    token VAL: ".*"
+
+    rule pair: ID "[ \t]*=[ \t]*" VAL "\n"
+</PRE>
+we would like to scan lines of text and pick out a name/value pair.
+In a conventional scanner, the input string <TT>shell=progman.exe</TT>
+would be turned into a single token of type <TT>VAL</TT>. The Yapps
+scanner, however, knows that at the beginning of the line, an
+<TT>ID</TT> is expected, so it will return <TT>"shell"</TT> as a token
+of type <TT>ID</TT>. Later, it will return <TT>"progman.exe"</TT> as
+a token of type <TT>VAL</TT>.<BR>
+<BR>
+Context sensitivity decreases the separation between scanner and
+parser, but it is useful in parsers like <TT>IniFile</TT>, where the
+tokens themselves are ambiguous, but <EM>are</EM> unambiguous
+given a particular stage in the parsing process.<BR>
+<BR>
+Unfortunately, context sensitivity can make it more difficult to
+detect errors in the input. For example, in parsing a Pascal-like
+language with “begin” and “end” as keywords, a context sensitive
+scanner would only match “end” as the END token if the parser is in
+a place that will accept the END token. If not, then the scanner
+would match “end” as an identifier. To disable the context
+sensitive scanner in Yapps, add the
+<TT>context-insensitive-scanner</TT> option to the grammar:
+<PRE CLASS="verbatim">
+parser X:
+    option: "context-insensitive-scanner"
+</PRE>
+Context-insensitive scanning makes the parser look cleaner as well.<BR>
+<BR>
+<!--TOC subsection Internal Variables-->
+
+<H3 CLASS="subsection"><A NAME="htoc17">5.3</A> Internal Variables</H3><!--SEC END -->
+
+There are two internal fields that may be of use. The parser object
+has two fields, <TT>_pos</TT>, which is the index of the current
+token being matched, and <TT>_scanner</TT>, which is the scanner
+object. The token itself can be retrieved by accessing the scanner
+object and calling the <TT>token</TT> method with the token index. However, if you call <TT>token</TT> before the token has been requested by the parser, it may mess up a context-sensitive scanner.<SUP><A NAME="text1" HREF="#note1">1</A></SUP> A
+potentially useful combination of these fields is to extract the
+portion of the input matched by the current rule. 
To do this, just save the scanner state (<TT>_scanner.pos</TT>) before the text is matched and then again after the text is matched:
+<PRE CLASS="verbatim">
+    rule R:
+        {{ start = self._scanner.pos }}
+        a b c
+        {{ end = self._scanner.pos }}
+        {{ print 'Text is', self._scanner.input[start:end] }}
+</PRE>
+<!--TOC subsection Pre- and Post-Parser Code-->
+
+<H3 CLASS="subsection"><A NAME="htoc18">5.4</A> Pre- and Post-Parser Code</H3><!--SEC END -->
+
+Sometimes the parser code needs to rely on helper variables,
+functions, and classes. A Yapps grammar can optionally be surrounded
+by double percent signs, to separate the grammar from Python code.
+<PRE CLASS="verbatim">
+... Python code ...
+%%
+... Yapps grammar ...
+%%
+... Python code ...
+</PRE>
+The second <CODE>%%</CODE> can be omitted if there is no Python code at the
+end, and the first <CODE>%%</CODE> can be omitted if there is no extra
+Python code at all. (To have code only at the end, both separators
+are required.)<BR>
+<BR>
+If the second <CODE>%%</CODE> is omitted, Yapps will insert testing code
+that allows you to use the generated parser to parse a file.<BR>
+<BR>
+The extended calculator example in the Yapps examples subdirectory
+includes both pre-parser and post-parser code.<BR>
+<BR>
+<!--TOC subsection Representation of Grammars-->
+
+<H3 CLASS="subsection"><A NAME="htoc19">5.5</A> Representation of Grammars</H3><!--SEC END -->
+
+For each kind of pattern there is a class derived from Pattern. Yapps
+has classes for Terminal, NonTerminal, Sequence, Choice, Option, Plus,
+Star, and Eval. Each of these classes has the following interface:
+<UL CLASS="itemize"><LI CLASS="li-itemize">
+ setup(<EM>gen</EM>) Set accepts-є, and call
+ <EM>gen.changed()</EM> if it changed. This function can change the
+ flag from false to true but <EM>not</EM> from true to false.
+ <LI CLASS="li-itemize">update(<EM>gen</EM>) Set <SPAN STYLE="font-variant:small-caps">first</SPAN> and <SPAN STYLE="font-variant:small-caps">follow</SPAN>, and call
+ <EM>gen.changed()</EM> if either changed. This function can add to
+ the sets but <EM>not</EM> remove from them.
+ <LI CLASS="li-itemize">output(<EM>gen</EM>, <EM>indent</EM>) Generate code for matching
+ this rule, using <EM>indent</EM> as the current indentation level.
+ Writes are performed using <EM>gen.write</EM>.
+ <LI CLASS="li-itemize">used(<EM>vars</EM>) Given a list of variables <EM>vars</EM>,
+ return two lists: one containing the variables that are used, and
+ one containing the variables that are assigned. This function is
+ used for optimizing the resulting code.
+</UL>
+Both <EM>setup</EM> and <EM>update</EM> monotonically increase the
+variables they modify. Since the variables can only increase a finite
+number of times, we can repeatedly call the function until the
+variables stabilize. The <EM>used</EM> function is not currently
+implemented.<BR>
+<BR>
+With each pattern in the grammar Yapps associates three pieces of
+information: the <SPAN STYLE="font-variant:small-caps">first</SPAN> set, the <SPAN STYLE="font-variant:small-caps">follow</SPAN> set, and the
+accepts-є flag.<BR>
+<BR>
+The <SPAN STYLE="font-variant:small-caps">first</SPAN> set contains the tokens that can appear as we start
+matching the pattern. The <SPAN STYLE="font-variant:small-caps">follow</SPAN> set contains the tokens that can
+appear immediately after we match the pattern. The accepts-є
+flag is true if the pattern can match no tokens. In this case,
+<SPAN STYLE="font-variant:small-caps">first</SPAN> will contain all the elements in <SPAN STYLE="font-variant:small-caps">follow</SPAN>. The <SPAN STYLE="font-variant:small-caps">follow</SPAN> set is not
+needed when accepts-є is false, and may not be accurate in
+those cases.<BR>
+<BR>
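+As a small worked example (not taken from the Yapps distribution),
+consider the pattern below:
+<PRE CLASS="verbatim">
+    rule R: 'a' [ 'b' ] 'c'
+</PRE>
+Here the optional subpattern <TT>[ 'b' ]</TT> accepts є; its
+<SPAN STYLE="font-variant:small-caps">first</SPAN> set is {<TT>'b'</TT>} and its <SPAN STYLE="font-variant:small-caps">follow</SPAN> set is {<TT>'c'</TT>},
+so the generated code matches <TT>'b'</TT> when it is the next token
+and otherwise proceeds directly to <TT>'c'</TT>.<BR>
+<BR>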
+Yapps does not compute these sets precisely. Its approximation can
+miss certain cases, such as this one:
+<PRE CLASS="verbatim">
+    rule C: ( A* | B )
+    rule B: C [A]
+</PRE>
+Yapps will calculate <TT>C</TT>'s <SPAN STYLE="font-variant:small-caps">follow</SPAN> set to include <TT>A</TT>.
+However, <TT>C</TT> will always match all the <TT>A</TT>'s, so <TT>A</TT> will
+never follow it. Yapps 2.0 does not properly handle this construct,
+but if it seems important, I may add support for it in a future
+version.<BR>
+<BR>
+Yapps also cannot handle constructs that depend on the calling
+sequence. For example:
+<PRE CLASS="verbatim">
+    rule R: U | 'b'
+    rule S: | 'c'
+    rule T: S 'b'
+    rule U: S 'a'
+</PRE>
+The <SPAN STYLE="font-variant:small-caps">follow</SPAN> set for <TT>S</TT> includes <TT>a</TT> and <TT>b</TT>. Since <TT>S</TT> can be empty, the <SPAN STYLE="font-variant:small-caps">first</SPAN> set for <TT>S</TT> should include <TT>a</TT>,
+<TT>b</TT>, and <TT>c</TT>. However, when parsing <TT>R</TT>, if the lookahead
+is <TT>b</TT> we should <EM>not</EM> parse <TT>U</TT>. That's because in <TT>U</TT>, <TT>S</TT> is followed by <TT>a</TT> and not <TT>b</TT>. Therefore in
+<TT>R</TT>, we should choose rule <TT>U</TT> only if there is an <TT>a</TT> or
+<TT>c</TT>, but not if there is a <TT>b</TT>. Yapps and many other LL(1)
+systems do not distinguish <TT>S b</TT> and <TT>S a</TT>, making <TT>S</TT>'s <SPAN STYLE="font-variant:small-caps">follow</SPAN> set <TT>a, b</TT>, and making <TT>R</TT> always try to match
+<TT>U</TT>. In this case we can solve the problem by changing <TT>R</TT> to
+<CODE>'b' | U</CODE> but it may not always be possible to solve all such
+problems in this way.<BR>
+<BR>
+<!--TOC section Grammar for Parsers-->
+
+<H2 CLASS="section"><A NAME="htoc20">A</A> Grammar for Parsers</H2><!--SEC END -->
+
+This is the grammar for parsers, without any Python code mixed in.
+The complete grammar can be found in <TT>parsedesc.g</TT> in the Yapps
+distribution.
+<PRE CLASS="verbatim">
+parser ParserDescription:
+    ignore:      "\\s+"
+    ignore:      "#.*?\r?\n"
+    token END:   "$"  # $ means end of string
+    token ATTR:  "<<.+?>>"
+    token STMT:  "{{.+?}}"
+    token ID:    '[a-zA-Z_][a-zA-Z_0-9]*'
+    token STR:   '[rR]?\'([^\\n\'\\\\]|\\\\.)*\'|[rR]?"([^\\n"\\\\]|\\\\.)*"'
+
+    rule Parser: "parser" ID ":"
+                 Options
+                 Tokens
+                 Rules
+                 END
+
+    rule Options: ( "option" ":" STR )*
+    rule Tokens:  ( "token" ID ":" STR | "ignore" ":" STR )*
+    rule Rules:   ( "rule" ID OptParam ":" ClauseA )*
+
+    rule ClauseA: ClauseB ( '[|]' ClauseB )*
+    rule ClauseB: ClauseC*
+    rule ClauseC: ClauseD [ '[+]' | '[*]' ]
+    rule ClauseD: STR | ID [ATTR] | STMT
+                | '\\(' ClauseA '\\)' | '\\[' ClauseA '\\]'
+</PRE>
+<!--TOC section Upgrading-->
+
+<H2 CLASS="section"><A NAME="htoc21">B</A> Upgrading</H2><!--SEC END -->
+
+Yapps 2.0 is not backwards compatible with Yapps 1.0. In this section
+are some tips for upgrading:
+<OL CLASS="enumerate" type=1><LI CLASS="li-enumerate">
+ Yapps 1.0 was distributed as a single file. Yapps 2.0 is
+ instead distributed as two Python files: a <EM>parser generator</EM>
+ (26k) and a <EM>parser runtime</EM> (5k). 
You need both files to
+ create parsers, but you need only the runtime (<TT>yappsrt.py</TT>)
+ to use the parsers.<BR>
+<BR>
+<LI CLASS="li-enumerate">Yapps 1.0 supported Python 1.4 regular expressions from the
+ <TT>regex</TT> module. Yapps 2.0 uses Python 1.5 regular
+ expressions from the <TT>re</TT> module. <EM>The new syntax for
+ regular expressions is not compatible with the old syntax.</EM>
+ Andrew Kuchling has a guide to converting regular expressions
+ (http://www.python.org/doc/howto/regex-to-re/) on his web page.<BR>
+<BR>
+<LI CLASS="li-enumerate">Yapps 1.0 wants a pattern and then a return value in <CODE>-></CODE>
+ <CODE><<...>></CODE>. Yapps 2.0 allows patterns and Python statements to
+ be mixed. To convert a rule like this:
+<PRE CLASS="verbatim">
+rule R: A B C -> << E1 >>
+      | X Y Z -> << E2 >>
+</PRE>to Yapps 2.0 form, replace the return value specifiers with return
+ statements:
+<PRE CLASS="verbatim">
+rule R: A B C {{ return E1 }}
+      | X Y Z {{ return E2 }}
+</PRE><LI CLASS="li-enumerate">Yapps 2.0 does not perform tail recursion elimination. This
+ means any recursive rules you write will be turned into recursive
+ methods in the parser. The parser will work, but may be slower.
+ It can be made faster by rewriting recursive rules, using instead
+ the looping operators <CODE>*</CODE> and <CODE>+</CODE> provided in Yapps 2.0.</OL>
+<!--TOC section Troubleshooting-->
+
+<H2 CLASS="section"><A NAME="htoc22">C</A> Troubleshooting</H2><!--SEC END -->
+
+<UL CLASS="itemize"><LI CLASS="li-itemize">
+ A common error is to write a grammar that doesn't have an END
+ token. End tokens are needed when it is not clear when to stop
+ parsing. For example, when parsing the expression <TT>3+5</TT>, it is
+ not clear after reading <TT>3</TT> whether to treat it as a complete
+ expression or whether the parser should continue reading.
+ Therefore the grammar for numeric expressions should include an end
+ token. Another example is the grammar for Lisp expressions. In
+ Lisp, it is always clear when you should stop parsing, so you do
+ <EM>not</EM> need an end token. In fact, it may be more useful not
+ to have an end token, so that you can read in several Lisp expressions.
+ <LI CLASS="li-itemize">If there is a chance of ambiguity, make sure to put the choices
+ in the order you want them checked. Usually the most specific
+ choice should be first. Empty sequences should usually be last.
+ <LI CLASS="li-itemize">The context sensitive scanner is not appropriate for all
+ grammars. You might try using the insensitive scanner with the
+ <TT>context-insensitive-scanner</TT> option in the grammar.
+ <LI CLASS="li-itemize">If performance turns out to be a problem, try writing a custom
+ scanner. The Yapps scanner is rather slow (but flexible and easy
+ to understand).
+</UL>
+<!--TOC section History-->
+
+<H2 CLASS="section"><A NAME="htoc23">D</A> History</H2><!--SEC END -->
+
+Yapps 1 had several limitations that bothered me while writing
+parsers:
+<OL CLASS="enumerate" type=1><LI CLASS="li-enumerate">
+ It was not possible to insert statements into the generated
+ parser. A common workaround was to write an auxiliary function
+ that executed those statements, and to call that function as part
+ of the return value calculation. For example, several of my
+ parsers had an “append(x,y)” function that existed solely to call
+ “x.append(y)”.
+ <LI CLASS="li-enumerate">The way in which grammars were specified was rather
+ restrictive: a rule was a choice of clauses. 
Each clause was a
+ sequence of tokens and rule names, followed by a return value.
+ <LI CLASS="li-enumerate">Optional matching had to be put into a separate rule because
+ choices were only made at the beginning of a rule.
+ <LI CLASS="li-enumerate">Repetition had to be specified in terms of recursion. Not only
+ was this awkward (sometimes requiring additional rules), I had to
+ add a tail recursion optimization to Yapps to transform the
+ recursion back into a loop.
+</OL>
+Yapps 2 addresses each of these limitations.
+<OL CLASS="enumerate" type=1><LI CLASS="li-enumerate">
+ Statements can occur anywhere within a rule. (However, only
+ one-line statements are allowed; multiline blocks marked by
+ indentation are not.)
+ <LI CLASS="li-enumerate">Grammars can be specified using any mix of sequences, choices,
+ tokens, and rule names. To allow for complex structures,
+ parentheses can be used for grouping.
+ <LI CLASS="li-enumerate">Given choices and parenthesization, optional matching can be
+ expressed as a choice between some pattern and nothing. In
+ addition, Yapps 2 has the convenience syntax <CODE>[A B ...]</CODE> for
+ matching <CODE>A B ...</CODE> optionally.
+ <LI CLASS="li-enumerate">Repetition operators <CODE>*</CODE> for zero or more and <CODE>+</CODE> for
+ one or more make it easy to specify repeating patterns.
+</OL>
+It is my hope that Yapps 2 will be flexible enough to meet my needs
+for another year, yet simple enough that I do not hesitate to use it.<BR>
+<BR>
+<!--TOC section Debian Extensions-->
+
+<H2 CLASS="section"><A NAME="htoc24">E</A> Debian Extensions</H2><!--SEC END -->
+
+<A NAME="sec:debian"></A>
+The Debian version adds the following enhancements to the original
+Yapps code. They were written by Matthias Urlichs.
+<OL CLASS="enumerate" type=1><LI CLASS="li-enumerate">
+ Yapps can stack input sources ("include files"). A usage example
+ is supplied with the calc.g sample program.
+ <LI CLASS="li-enumerate">Yapps now understands augmented ignore-able patterns.
+ This means that Yapps can parse multi-line C comments; this wasn't
+ possible before.
+ <LI CLASS="li-enumerate">Better error reporting.
+ <LI CLASS="li-enumerate">Yapps now reads its input incrementally.
+</OL>
+The parser runtime has been renamed to <TT>yapps/runtime.py</TT>.
+In Debian, this file is provided by the <TT>yapps2-runtime</TT> package.
+You need to depend on it if you Debianize Python programs that use
+yapps.<BR>
+<BR>
+<!--TOC section Future Extensions-->
+
+<H2 CLASS="section"><A NAME="htoc25">F</A> Future Extensions</H2><!--SEC END -->
+
+<A NAME="sec:future"></A>
+I am still investigating the possibility of LL(2) and higher
+lookahead. However, it looks like the resulting parsers will be
+somewhat ugly. <BR>
+<BR>
+It would be nice to control choices with user-defined predicates.<BR>
+<BR>
+The most likely future extension is backtracking. A grammar pattern
+like <CODE>(VAR ':=' expr)? {{ return Assign(VAR,expr) }} : expr {{ return expr }}</CODE>
+would turn into code that attempted to match <CODE>VAR ':=' expr</CODE>. If
+it succeeded, it would run <CODE>{{ return ... }}</CODE>. If it failed, it
+would match <CODE>expr {{ return expr }}</CODE>. 
Backtracking may make it
+less necessary to write LL(2) grammars.<BR>
+<BR>
+<!--TOC section References-->
+
+<H2 CLASS="section"><A NAME="htoc26">G</A> References</H2><!--SEC END -->
+
+<OL CLASS="enumerate" type=1><LI CLASS="li-enumerate">
+ The Python-Parser SIG (http://www.python.org/sigs/parser-sig/) is
+ the first place to look for a list of parser systems for Python.<BR>
+<BR>
+<LI CLASS="li-enumerate">ANTLR/PCCTS, by Terence Parr, is available at
+ the ANTLR Home Page (http://www.antlr.org/).<BR>
+<BR>
+<LI CLASS="li-enumerate">PyLR, by Scott Cotton, is at his Starship
+ page (http://starship.skyport.net/crew/scott/PyLR.html).<BR>
+<BR>
+<LI CLASS="li-enumerate">John Aycock's Compiling Little Languages
+ Framework (http://www.foretec.com/python/workshops/1998-11/proceedings/papers/aycock-little/aycock-little.html).<BR>
+<BR>
+<LI CLASS="li-enumerate">PyBison, by Scott Hassan, can be found at
+ his Python Projects page (http://coho.stanford.edu/~hassan/Python/).<BR>
+<BR>
+<LI CLASS="li-enumerate">mcf.pars, by Mike C. Fletcher, is available at
+ his web page (http://members.rogers.com/mcfletch/programming/simpleparse/simpleparse.html).<BR>
+<BR>
+<LI CLASS="li-enumerate">kwParsing, by Aaron Watters, is available at
+ his Starship page (http://starship.skyport.net/crew/aaron_watters/kwParsing/).
+</OL>
+<!--BEGIN NOTES document-->
+<HR WIDTH="50%" SIZE=1><DL CLASS="list"><DT CLASS="dt-list"><A NAME="note1" HREF="#text1"><FONT SIZE=5>1</FONT></A><DD CLASS="dd-list">When using a context-sensitive scanner, the parser tells the scanner what the valid token types are at each point. If you call <TT>token</TT> before the parser can tell the scanner the valid token types, the scanner will attempt to match without considering the context.
+</DL>
+<!--END NOTES-->
+<!--HTMLFOOT-->
+<!--ENDHTML-->
+<!--FOOTER-->
+<HR SIZE=2><BLOCKQUOTE CLASS="quote"><EM>This document was translated from L<sup>A</sup>T<sub>E</sub>X by
+</EM><A HREF="http://pauillac.inria.fr/~maranget/hevea/index.html"><EM>H<FONT SIZE=2><sup>E</sup></FONT>V<FONT SIZE=2><sup>E</sup></FONT>A</EM></A><EM>.</EM></BLOCKQUOTE></BODY>
+</HTML>
diff --git a/doc/yapps2.htoc b/doc/yapps2.htoc
new file mode 100644
index 0000000..9fabbc6
--- /dev/null
+++ b/doc/yapps2.htoc
@@ -0,0 +1,36 @@
+\begin{tocenv}
+\tocitem \@locref{htoc1}{\begin{@norefs}\@print{1}\quad{}Introduction\end{@norefs}}
+\tocitem \@locref{htoc2}{\begin{@norefs}\@print{2}\quad{}Examples\end{@norefs}}
+\begin{tocenv}
+\tocitem \@locref{htoc3}{\begin{@norefs}\@print{2.1}\quad{}Introduction to Grammars\end{@norefs}}
+\tocitem \@locref{htoc4}{\begin{@norefs}\@print{2.2}\quad{}Lisp Expressions\end{@norefs}}
+\tocitem \@locref{htoc5}{\begin{@norefs}\@print{2.3}\quad{}Calculator\end{@norefs}}
+\tocitem \@locref{htoc6}{\begin{@norefs}\@print{2.4}\quad{}Calculator with Memory\end{@norefs}}
+\end{tocenv}
+\tocitem \@locref{htoc7}{\begin{@norefs}\@print{3}\quad{}Grammars\end{@norefs}}
+\begin{tocenv}
+\tocitem \@locref{htoc8}{\begin{@norefs}\@print{3.1}\quad{}Left Factoring\end{@norefs}}
+\tocitem \@locref{htoc9}{\begin{@norefs}\@print{3.2}\quad{}Left Recursion\end{@norefs}}
+\tocitem \@locref{htoc10}{\begin{@norefs}\@print{3.3}\quad{}Ambiguous Grammars\end{@norefs}}
+\end{tocenv}
+\tocitem \@locref{htoc11}{\begin{@norefs}\@print{4}\quad{}Customization\end{@norefs}}
+\begin{tocenv}
+\tocitem \@locref{htoc12}{\begin{@norefs}\@print{4.1}\quad{}Customizing Parsers\end{@norefs}}
+\tocitem \@locref{htoc13}{\begin{@norefs}\@print{4.2}\quad{}Customizing Scanners\end{@norefs}}
+\end{tocenv}
+\tocitem 
\@locref{htoc14}{\begin{@norefs}\@print{5}\quad{}Parser Mechanics\end{@norefs}} +\begin{tocenv} +\tocitem \@locref{htoc15}{\begin{@norefs}\@print{5.1}\quad{}Parser Objects\end{@norefs}} +\tocitem \@locref{htoc16}{\begin{@norefs}\@print{5.2}\quad{}Context Sensitive Scanner\end{@norefs}} +\tocitem \@locref{htoc17}{\begin{@norefs}\@print{5.3}\quad{}Internal Variables\end{@norefs}} +\tocitem \@locref{htoc18}{\begin{@norefs}\@print{5.4}\quad{}Pre- and Post-Parser Code\end{@norefs}} +\tocitem \@locref{htoc19}{\begin{@norefs}\@print{5.5}\quad{}Representation of Grammars\end{@norefs}} +\end{tocenv} +\tocitem \@locref{htoc20}{\begin{@norefs}\@print{A}\quad{}Grammar for Parsers\end{@norefs}} +\tocitem \@locref{htoc21}{\begin{@norefs}\@print{B}\quad{}Upgrading\end{@norefs}} +\tocitem \@locref{htoc22}{\begin{@norefs}\@print{C}\quad{}Troubleshooting\end{@norefs}} +\tocitem \@locref{htoc23}{\begin{@norefs}\@print{D}\quad{}History\end{@norefs}} +\tocitem \@locref{htoc24}{\begin{@norefs}\@print{E}\quad{}Debian Extensions\end{@norefs}} +\tocitem \@locref{htoc25}{\begin{@norefs}\@print{F}\quad{}Future Extensions\end{@norefs}} +\tocitem \@locref{htoc26}{\begin{@norefs}\@print{G}\quad{}References\end{@norefs}} +\end{tocenv} diff --git a/doc/yapps2.tex b/doc/yapps2.tex new file mode 100644 index 0000000..31b44be --- /dev/null +++ b/doc/yapps2.tex @@ -0,0 +1,1246 @@ +\documentclass[10pt]{article} +\usepackage{palatino} +\usepackage{html} +\usepackage{color} + +\setlength{\headsep}{0in} +\setlength{\headheight}{0in} +\setlength{\textheight}{8.5in} +\setlength{\textwidth}{5.9in} +\setlength{\oddsidemargin}{0.25in} + +\definecolor{darkblue}{rgb}{0,0,0.6} +\definecolor{darkerblue}{rgb}{0,0,0.3} + +%% \newcommand{\mysection}[1]{\section{\textcolor{darkblue}{#1}}} +%% \newcommand{\mysubsection}[1]{\subsection{\textcolor{darkerblue}{#1}}} +\newcommand{\mysection}[1]{\section{#1}} +\newcommand{\mysubsection}[1]{\subsection{#1}} + +\bodytext{bgcolor=white text=black link=#004080 vlink=#006020} + +\newcommand{\first}{\textsc{first}} +\newcommand{\follow}{\textsc{follow}} + +\begin{document} + +\begin{center} +\hfill \begin{tabular}{c} +{\Large The \emph{Yapps} Parser Generator System}\\ +\verb|http://theory.stanford.edu/~amitp/Yapps/|\\ + Version 2\\ +\\ +Amit J. Patel\\ +\htmladdnormallink{http://www-cs-students.stanford.edu/~amitp/} +{http://www-cs-students.stanford.edu/~amitp/} + +\end{tabular} \hfill \rule{0in}{0in} +\end{center} + +\mysection{Introduction} + +\emph{Yapps} (\underline{Y}et \underline{A}nother \underline{P}ython +\underline{P}arser \underline{S}ystem) is an easy to use parser +generator that is written in Python and generates Python code. There +are several parser generator systems already available for Python, +including \texttt{PyLR, kjParsing, PyBison,} and \texttt{mcf.pars,} +but I had different goals for my parser. Yapps is simple, is easy to +use, and produces human-readable parsers. It is not the fastest or +most powerful parser. Yapps is designed to be used when regular +expressions are not enough and other parser systems are too much: +situations where you may write your own recursive descent parser. + +Some unusual features of Yapps that may be of interest are: + +\begin{enumerate} + + \item Yapps produces recursive descent parsers that are readable by + humans, as opposed to table-driven parsers that are difficult to + read. 
A Yapps parser for a simple calculator looks similar to the + one that Mark Lutz wrote by hand for \emph{Programming Python.} + + \item Yapps also allows for rules that accept parameters and pass + arguments to be used while parsing subexpressions. Grammars that + allow for arguments to be passed to subrules and for values to be + passed back are often called \emph{attribute grammars.} In many + cases parameterized rules can be used to perform actions at ``parse + time'' that are usually delayed until later. For example, + information about variable declarations can be passed into the + rules that parse a procedure body, so that undefined variables can + be detected at parse time. The types of defined variables can be + used in parsing as well---for example, if the type of {\tt X} is + known, we can determine whether {\tt X(1)} is an array reference or + a function call. + + \item Yapps grammars are fairly easy to write, although there are + some inconveniences having to do with ELL(1) parsing that have to be + worked around. For example, rules have to be left factored and + rules may not be left recursive. However, neither limitation seems + to be a problem in practice. + + Yapps grammars look similar to the notation used in the Python + reference manual, with operators like \verb:*:, \verb:+:, \verb:|:, + \verb:[]:, and \verb:(): for patterns, names ({\tt tim}) for rules, + regular expressions (\verb:"[a-z]+":) for tokens, and \verb:#: for + comments. + + \item The Yapps parser generator is written as a single Python module + with no C extensions. Yapps produces parsers that are written + entirely in Python, and require only the Yapps run-time module (5k) + for support. + + \item Yapps's scanner is context-sensitive, picking tokens based on + the types of the tokens accepted by the parser. This can be + helpful when implementing certain kinds of parsers, such as for a + preprocessor. + +\end{enumerate} + +There are several disadvantages of using Yapps over another parser system: + +\begin{enumerate} + + \item Yapps parsers are \texttt{ELL(1)} (Extended LL(1)), which is + less powerful than \texttt{LALR} (used by \texttt{PyLR}) or + \texttt{SLR} (used by \texttt{kjParsing}), so Yapps would not be a + good choice for parsing complex languages. For example, allowing + both \texttt{x := 5;} and \texttt{x;} as statements is difficult + because we must distinguish based on only one token of lookahead. + Seeing only \texttt{x}, we cannot decide whether we have an + assignment statement or an expression statement. (Note however + that this kind of grammar can be matched with backtracking; see + section \ref{sec:future}.) + + \item The scanner that Yapps provides can only read from strings, not + files, so an entire file has to be read in before scanning can + begin. It is possible to build a custom scanner, though, so in + cases where stream input is needed (from the console, a network, or + a large file are examples), the Yapps parser can be given a custom + scanner that reads from a stream instead of a string. + + \item Yapps is not designed with efficiency in mind. + +\end{enumerate} + +Yapps provides an easy to use parser generator that produces parsers +similar to what you might write by hand. It is not meant to be a +solution for all parsing problems, but instead an aid for those times +you would write a parser by hand rather than using one of the more +powerful parsing packages available. + +Yapps 2.0 is easier to use than Yapps 1.0. 
New features include a
+less restrictive input syntax, which allows mixing of sequences,
+choices, terminals, and nonterminals; optional matching; the ability
+to insert single-line statements into the generated parser; and
+looping constructs \verb|*| and \verb|+| similar to the repetitive
+matching constructs in regular expressions. Unfortunately, the
+addition of these constructs has made Yapps 2.0 incompatible with
+Yapps 1.0, so grammars will have to be rewritten. See section
+\ref{sec:Upgrading} for tips on changing Yapps 1.0 grammars for use
+with Yapps 2.0.
+
+\mysection{Examples}
+
+In this section are several examples that show the use of Yapps.
+First, an introduction shows how to construct grammars and write them
+in Yapps form. This example can be skipped by someone familiar with
+grammars and parsing. Next is a Lisp expression grammar that produces
+a parse tree as output. This example demonstrates the use of tokens
+and rules, as well as returning values from rules. The third example
+is an expression evaluation grammar that evaluates during parsing
+(instead of producing a parse tree).
+
+\mysubsection{Introduction to Grammars}
+
+A \emph{grammar} for a natural language specifies how words can be put
+together to form large structures, such as phrases and sentences. A
+grammar for a computer language is similar in that it specifies how
+small components (called \emph{tokens}) can be put together to form
+larger structures. In this section we will write a grammar for a tiny
+subset of English.
+
+Simple English sentences can be described as being a noun phrase
+followed by a verb followed by a noun phrase. For example, in the
+sentence, ``Jack sank the blue ship,'' the word ``Jack'' is the first
+noun phrase, ``sank'' is the verb, and ``the blue ship'' is the second
+noun phrase. In addition we should say what a noun phrase is; for
+this example we shall say that a noun phrase is an optional article
+(a, an, the) followed by any number of adjectives followed by a noun.
+The tokens in our language are the articles, nouns, verbs, and
+adjectives. The \emph{rules} in our language will tell us how to
+combine the tokens together to form lists of adjectives, noun phrases,
+and sentences:
+
+\begin{itemize}
+ \item \texttt{sentence: noun\_phrase verb noun\_phrase}
+ \item \texttt{noun\_phrase: [article] adjective* noun}
+\end{itemize}
+
+Notice that some things that we said easily in English, such as
+``optional article,'' are expressed using special syntax, such as
+brackets. When we said, ``any number of adjectives,'' we wrote
+\texttt{adjective*}, where the \texttt{*} means ``zero or more of the
+preceding pattern''.
+
+The grammar given above is close to a Yapps grammar. We also have to
+specify what the tokens are, and what to do when a pattern is matched.
+For this example, we will do nothing when patterns are matched; the
+next example will explain how to perform match actions.
+
+\begin{verbatim}
+parser TinyEnglish:
+    ignore:          "\\W+"
+    token noun:      "(Jack|spam|ship)"
+    token verb:      "(sank|threw)"
+    token article:   "(a|an|the)"
+    token adjective: "(blue|red|green)"
+
+    rule sentence:    noun_phrase verb noun_phrase
+    rule noun_phrase: [article] adjective* noun
+\end{verbatim}
+
+The tokens are specified as Python \emph{regular expressions}. Since
+Yapps produces Python code, you can write any regular expression that
+would be accepted by Python. 
(\emph{Note:} These are Python 1.5
+regular expressions from the \texttt{re} module, not Python 1.4
+regular expressions from the \texttt{regex} module.) In addition to
+tokens that you want to see (which are given names), you can also
+specify tokens to ignore, marked by the \texttt{ignore} keyword. In
+this parser we want to ignore whitespace.
+
+The TinyEnglish grammar shows how you define tokens and rules, but it
+does not specify what should happen once we've matched the rules. In
+the next example, we will take a grammar and produce a \emph{parse
+tree} from it.
+
+\mysubsection{Lisp Expressions}
+
+Lisp syntax, although hated by many, has a redeeming quality: it is
+simple to parse. In this section we will construct a Yapps grammar to
+parse Lisp expressions and produce a parse tree as output.
+
+\subsubsection*{Defining the Grammar}
+
+The syntax of Lisp is simple. It has expressions, which are
+identifiers, strings, numbers, and lists. A list is a left
+parenthesis followed by some number of expressions (separated by
+spaces) followed by a right parenthesis. For example, \verb|5|,
+\verb|"ni"|, and \verb|(print "1+2 = " (+ 1 2))| are Lisp expressions.
+Written as a grammar,
+
+\begin{verbatim}
+ expr: ID | STR | NUM | list
+ list: ( expr* )
+\end{verbatim}
+
+In addition to having a grammar, we need to specify what to do every
+time something is matched. For the tokens, which are strings, we just
+want to get the ``value'' of the token, attach its type (identifier,
+string, or number) in some way, and return it. For the lists, we want
+to construct and return a Python list.
+
+Once some pattern is matched, we write a return statement enclosed
+in \verb|{{...}}|. The braces allow us to insert any one-line
+statement into the parser. Within this statement, we can refer to the
+values returned by matching each part of the rule. After matching a
+token such as \texttt{ID}, ``ID'' will be bound to the text of the
+matched token. Let's take a look at the rule:
+
+\begin{verbatim}
+ rule expr: ID {{ return ('id', ID) }}
+ ...
+\end{verbatim}
+
+In a rule, tokens return the text that was matched. For identifiers,
+we just return the identifier, along with a ``tag'' telling us that
+this is an identifier and not a string or some other value. Sometimes
+we may need to convert this text to a different form. For example, if
+a string is matched, we want to remove quotes and handle special forms
+like \verb|\n|. If a number is matched, we want to convert it into a
+number. Let's look at the return values for the other tokens:
+
+\begin{verbatim}
+ ...
+ | STR {{ return ('str', eval(STR)) }}
+ | NUM {{ return ('num', atoi(NUM)) }}
+ ...
+\end{verbatim}
+
+If we get a string, we want to remove the quotes and process any
+special backslash codes, so we run \texttt{eval} on the quoted string.
+If we get a number, we convert it to an integer with \texttt{atoi} and
+then return the number along with its type tag.
+
+For matching a list, we need to do something slightly more
+complicated. If we match a Lisp list of expressions, we want to
+create a Python list with those values.
+
+\begin{verbatim}
+ rule list: "\\("                     # Match the opening parenthesis
+            {{ result = [] }}         # Create a Python list
+            (
+              expr                       # When we match an expression,
+              {{ result.append(expr) }}  # add it to the list
+            )*                        # * means repeat this if needed
+            "\\)"                     # Match the closing parenthesis
+            {{ return result }}       # Return the Python list
+\end{verbatim}
+
+In this rule we first match the opening parenthesis, then go into a
+loop. 
In this loop we match expressions and add them to the list.
+When there are no more expressions to match, we match the closing
+parenthesis and return the resulting list. Note that \verb:#: is used for
+comments, just as in Python.
+
+The complete grammar is specified as follows:
+\begin{verbatim}
+parser Lisp:
+    ignore:    '\\s+'
+    token NUM: '[0-9]+'
+    token ID:  '[-+*/!@%^&=.a-zA-Z0-9_]+'
+    token STR: '"([^\\"]+|\\\\.)*"'
+
+    rule expr: ID   {{ return ('id', ID) }}
+             | STR  {{ return ('str', eval(STR)) }}
+             | NUM  {{ return ('num', atoi(NUM)) }}
+             | list {{ return list }}
+    rule list: "\\(" {{ result = [] }}
+               ( expr {{ result.append(expr) }}
+               )*
+               "\\)" {{ return result }}
+\end{verbatim}
+
+One thing you may have noticed is that \verb|"\\("| and \verb|"\\)"|
+appear in the \texttt{list} rule. These are \emph{inline tokens}:
+they appear in the rules without being given a name with the
+\texttt{token} keyword. Inline tokens are more convenient to use, but
+since they do not have a name, the text that is matched cannot be used
+in the return value. They are best used for short simple patterns
+(usually punctuation or keywords).
+
+Another thing to notice is that the number and identifier tokens
+overlap. For example, ``487'' matches both NUM and ID. In Yapps, the
+scanner only tries to match tokens that are acceptable to the parser.
+This rule doesn't help here, since both NUM and ID can appear in the
+same place in the grammar. There are two rules used to pick tokens if
+more than one matches. One is that the \emph{longest} match is
+preferred. For example, ``487x'' will match as an ID (487x) rather
+than as a NUM (487) followed by an ID (x). The second rule is that if
+the two matches are the same length, the \emph{first} one listed in
+the grammar is preferred. For example, ``487'' will match as a NUM
+rather than an ID because NUM is listed first in the grammar. Inline
+tokens have preference over any tokens you have listed.
+
+Now that our grammar is defined, we can run Yapps to produce a parser,
+and then run the parser to produce a parse tree.
+
+\subsubsection*{Running Yapps}
+
+In the Yapps module is a function \texttt{generate} that takes an
+input filename and writes a parser to another file. We can use this
+function to generate the Lisp parser, which is assumed to be in
+\texttt{lisp.g}.
+
+\begin{verbatim}
+% python
+Python 1.5.1 (#1, Sep 3 1998, 22:51:17) [GCC 2.7.2.3] on linux-i386
+Copyright 1991-1995 Stichting Mathematisch Centrum, Amsterdam
+>>> import yapps
+>>> yapps.generate('lisp.g')
+\end{verbatim}
+
+At this point, Yapps has written a file \texttt{lisp.py} that contains
+the parser. In that file are two classes (one scanner and one parser)
+and a function (called \texttt{parse}) that puts things together for
+you.
+
+Alternatively, we can run Yapps from the command line to generate the
+parser file:
+
+\begin{verbatim}
+% python yapps.py lisp.g
+\end{verbatim}
+
+After running Yapps either from within Python or from the command
+line, we can use the Lisp parser by calling the \texttt{parse}
+function. The first parameter should be the rule we want to match,
+and the second parameter should be the string to parse. 
+ +\begin{verbatim} +>>> import lisp +>>> lisp.parse('expr', '(+ 3 4)') +[('id', '+'), ('num', 3), ('num', 4)] +>>> lisp.parse('expr', '(print "3 = " (+ 1 2))') +[('id', 'print'), ('str', '3 = '), [('id', '+'), ('num', 1), ('num', 2)]] +\end{verbatim} + +The \texttt{parse} function is not the only way to use the parser; +section \ref{sec:Parser-Objects} describes how to access parser objects +directly. + +We've now gone through the steps in creating a grammar, writing a +grammar file for Yapps, producing a parser, and using the parser. In +the next example we'll see how rules can take parameters and also how +to do computations instead of just returning a parse tree. + +\mysubsection{Calculator} + +A common example parser given in many textbooks is that for simple +expressions, with numbers, addition, subtraction, multiplication, +division, and parenthesization of subexpressions. We'll write this +example in Yapps, evaluating the expression as we parse. + +Unlike \texttt{yacc}, Yapps does not have any way to specify +precedence rules, so we have to do it ourselves. We say that an +expression is the sum of terms, and that a term is the product of +factors, and that a factor is a number or a parenthesized expression: + +\begin{verbatim} + expr: factor ( ("+"|"-") factor )* + factor: term ( ("*"|"/") term )* + term: NUM | "(" expr ")" +\end{verbatim} + +In order to evaluate the expression as we go, we should keep along an +accumulator while evaluating the lists of terms or factors. Just as +we kept a ``result'' variable to build a parse tree for Lisp +expressions, we will use a variable to evaluate numerical +expressions. The full grammar is given below: + +\begin{verbatim} +parser Calculator: + token END: "$" # $ means end of string + token NUM: "[0-9]+" + + rule goal: expr END {{ return expr }} + + # An expression is the sum and difference of factors + rule expr: factor {{ v = factor }} + ( "[+]" factor {{ v = v+factor }} + | "-" factor {{ v = v-factor }} + )* {{ return v }} + + # A factor is the product and division of terms + rule factor: term {{ v = term }} + ( "[*]" term {{ v = v*term }} + | "/" term {{ v = v/term }} + )* {{ return v }} + + # A term is either a number or an expression surrounded by parentheses + rule term: NUM {{ return atoi(NUM) }} + | "\\(" expr "\\)" {{ return expr }} +\end{verbatim} + +The top-level rule is \emph{goal}, which says that we are looking for +an expression followed by the end of the string. The \texttt{END} +token is needed because without it, it isn't clear when to stop +parsing. For example, the string ``1+3'' could be parsed either as +the expression ``1'' followed by the string ``+3'' or it could be +parsed as the expression ``1+3''. By requiring expressions to end +with \texttt{END}, the parser is forced to take ``1+3''. + +In the two rules with repetition, the accumulator is named \texttt{v}. +After reading in one expression, we initialize the accumulator. Each +time through the loop, we modify the accumulator by adding, +subtracting, multiplying by, or dividing the previous accumulator by +the expression that has been parsed. At the end of the rule, we +return the accumulator. + +The calculator example shows how to process lists of elements using +loops, as well as how to handle precedence of operators. + +\emph{Note:} It's often important to put the \texttt{END} token in, so +put it in unless you are sure that your grammar has some other +non-ambiguous token marking the end of the program. 
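+
+For concreteness, a short session with the generated parser might
+look like the one below. The file name \texttt{calc.g} and module
+name \texttt{calc} are assumptions for this sketch, and \texttt{atoi}
+must be made available to the generated module (for example, with a
+\texttt{from string import atoi} line placed before the grammar); the
+complete version in the examples subdirectory takes care of such
+details.
+
+\begin{verbatim}
+% python yapps.py calc.g
+% python
+>>> import calc
+>>> calc.parse('goal', '1+2*3')
+7
+>>> calc.parse('goal', '(1+2)*3')
+9
+\end{verbatim}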
+
+\mysubsection{Calculator with Memory}
+
+In the previous example we learned how to write a calculator that
+evaluates simple numerical expressions. In this section we will
+extend the example to support both local and global variables.
+
+To support global variables, we will add assignment statements to the
+``goal'' rule.
+
+\begin{verbatim}
+    rule goal: expr END          {{ return expr }}
+             | 'set' ID expr END {{ global_vars[ID] = expr }}
+                                 {{ return expr }}
+\end{verbatim}
+
+To use these variables, we need a new kind of terminal:
+
+\begin{verbatim}
+    rule term: ... | ID {{ return global_vars[ID] }}
+\end{verbatim}
+
+So far, these changes are straightforward. We simply have a global
+dictionary \texttt{global\_vars} that stores the variables and values,
+we modify it when there is an assignment statement, and we look up
+variables in it when we see a variable name.
+
+To support local variables, we will add variable declarations to the
+set of allowed expressions.
+
+\begin{verbatim}
+    rule term: ... | 'let' VAR '=' expr 'in' expr ...
+\end{verbatim}
+
+This is where it becomes tricky. Local variables should be stored in
+a local dictionary, not in the global one. One trick would be to save
+a copy of the global dictionary, modify it, and then restore it
+later. In this example we will instead use \emph{attributes} to
+create local information and pass it to subrules.
+
+A rule can optionally take parameters. When we invoke the rule, we
+must pass in arguments. For local variables, let's use a single
+parameter, \texttt{local\_vars}:
+
+\begin{verbatim}
+    rule expr<<local_vars>>: ...
+    rule factor<<local_vars>>: ...
+    rule term<<local_vars>>: ...
+\end{verbatim}
+
+Each time we want to match \texttt{expr}, \texttt{factor}, or
+\texttt{term}, we will pass the local variables in the current rule to
+the subrule. One interesting case is when we pass as an argument
+something \emph{other} than \texttt{local\_vars}:
+
+\begin{verbatim}
+    rule term<<local_vars>>: ...
+             | 'let' VAR '=' expr<<local_vars>>
+               {{ local_vars = [(VAR, expr)] + local_vars }}
+               'in' expr<<local_vars>>
+               {{ return expr }}
+\end{verbatim}
+
+Note that the assignment to the local variables list does not modify
+the original list. This is important to keep local variables from
+being seen outside the ``let''.
+
+The other interesting case is when we find a variable:
+
+\begin{verbatim}
+global_vars = {}
+
+def lookup(map, name):
+    for x, v in map:
+        if x == name: return v
+    return global_vars[name]
+%%
+    ...
+    rule term<<local_vars>>: ...
+             | VAR {{ return lookup(local_vars, VAR) }}
+\end{verbatim}
+
+The lookup function will search through the local variable list, and
+if it cannot find the name there, it will look it up in the global
+variable dictionary.
+
+A complete grammar for this example, including a read-eval-print loop
+for interacting with the calculator, can be found in the examples
+subdirectory included with Yapps.
+
+In this section we saw how to insert code before the parser. We also
+saw how to use attributes to transmit local information from one rule
+to its subrules.
+
+\mysection{Grammars}
+
+Each Yapps grammar has a name, a list of tokens, and a set of
+production rules. A grammar named \texttt{X} will be used to produce
+a parser named \texttt{X} and a scanner named \texttt{XScanner}. As
+in Python, names are case sensitive, start with a letter, and contain
+letters, numbers, and underscores (\_).
+
+There are three kinds of tokens in Yapps: named, inline, and ignored. 
+As their name implies, named tokens are given a name, using the token +construct: \texttt{token \emph{name} : \emph{regexp}}. In a rule, the +token can be matched by using the name. Inline tokens are regular +expressions that are used in rules without being declared. Ignored +tokens are declared using the ignore construct: \texttt{ignore: + \emph{regexp}}. These tokens are ignored by the scanner, and are +not seen by the parser. Often whitespace is an ignored token. The +regular expressions used to define tokens should use the syntax +defined in the \texttt{re} module, so some symbols may have to be +backslashed. + +Production rules in Yapps have a name and a pattern to match. If the +rule is parameterized, the name should be followed by a list of +parameter names in \verb|<<...>>|. A pattern can be a simple pattern +or a compound pattern. Simple patterns are the name of a named token, +a regular expression in quotes (inline token), the name of a +production rule (followed by arguments in \verb|<<...>>|, if the rule +has parameters), and single line Python statements (\verb|{{...}}|). +Compound patterns are sequences (\verb|A B C ...|), choices ( +\verb:A | B | C | ...:), options (\verb|[...]|), zero-or-more repetitions +(\verb|...*|), and one-or-more repetitions (\verb|...+|). Like +regular expressions, repetition operators have a higher precedence +than sequences, and sequences have a higher precedence than choices. + +Whenever \verb|{{...}}| is used, a legal one-line Python statement +should be put inside the braces. The token \verb|}}| should not +appear within the \verb|{{...}}| section, even within a string, since +Yapps does not attempt to parse the Python statement. A workaround +for strings is to put two strings together (\verb|"}" "}"|), or to use +backslashes (\verb|"}\}"|). At the end of a rule you should use a +\verb|{{ return X }}| statement to return a value. However, you +should \emph{not} use any control statements (\texttt{return}, +\texttt{continue}, \texttt{break}) in the middle of a rule. Yapps +needs to make assumptions about the control flow to generate a parser, +and any changes to the control flow will confuse Yapps. + +The \verb|<<...>>| form can occur in two places: to define parameters +to a rule and to give arguments when matching a rule. Parameters use +the syntax used for Python functions, so they can include default +arguments and the special forms (\verb|*args| and \verb|**kwargs|). +Arguments use the syntax for Python function call arguments, so they +can include normal arguments and keyword arguments. The token +\verb|>>| should not appear within the \verb|<<...>>| section. + +In both the statements and rule arguments, you can use names defined +by the parser to refer to matched patterns. You can refer to the text +matched by a named token by using the token name. You can use the +value returned by a production rule by using the name of that rule. +If a name \texttt{X} is matched more than once (such as in loops), you +will have to save the earlier value(s) in a temporary variable, and +then use that temporary variable in the return value. The next +section has an example of a name that occurs more than once. + +\mysubsection{Left Factoring} +\label{sec:Left-Factoring} + +Yapps produces ELL(1) parsers, which determine which clause to match +based on the first token available. Sometimes the leftmost tokens of +several clauses may be the same. 
+\emph{if/then/else} construct in Pascal:
+
+\begin{verbatim}
+rule stmt: "if" expr "then" stmt {{ then_part = stmt }}
+           "else" stmt           {{ return ('If',expr,then_part,stmt) }}
+         | "if" expr "then" stmt {{ return ('If',expr,stmt,[]) }}
+\end{verbatim}
+
+(Note that we have to save the first \texttt{stmt} into a variable
+because there is another \texttt{stmt} that will be matched.) The
+left portions of the two clauses are the same, which presents a
+problem for the parser. The solution is \emph{left-factoring}: the
+common parts are put together, and \emph{then} a choice is made about
+the remaining part:
+
+\begin{verbatim}
+rule stmt: "if" expr
+           "then" stmt {{ then_part = stmt }}
+           {{ else_part = [] }}
+           [ "else" stmt {{ else_part = stmt }} ]
+           {{ return ('If', expr, then_part, else_part) }}
+\end{verbatim}
+
+Unfortunately, the classic \emph{if/then/else} situation is
+\emph{still} ambiguous when you left-factor. Yapps can deal with this
+situation, but will report a warning; see section
+\ref{sec:Ambiguous-Grammars} for details.
+
+In general, replace rules of the form:
+
+\begin{verbatim}
+rule A: a b1 {{ return E1 }}
+      | a b2 {{ return E2 }}
+      | c3   {{ return E3 }}
+      | c4   {{ return E4 }}
+\end{verbatim}
+
+with rules of the form:
+
+\begin{verbatim}
+rule A: a ( b1 {{ return E1 }}
+          | b2 {{ return E2 }}
+          )
+      | c3   {{ return E3 }}
+      | c4   {{ return E4 }}
+\end{verbatim}
+
+\mysubsection{Left Recursion}
+
+A common construct in grammars is for matching a list of patterns,
+sometimes separated with delimiters such as commas or semicolons. In
+LR-based parser systems, we can parse a list with something like this:
+
+\begin{verbatim}
+rule sum: NUM           {{ return NUM }}
+        | sum "+" NUM   {{ return (sum, NUM) }}
+\end{verbatim}
+
+Parsing \texttt{1+2+3+4} would produce the output
+\texttt{(((1,2),3),4)}, which is what we want from a left-associative
+addition operator. Unfortunately, this grammar is \emph{left
+recursive,} because the \texttt{sum} rule contains a clause that
+begins with \texttt{sum}. (The recursion occurs at the left side of
+the clause.)
+
+We must restructure this grammar to be \emph{right recursive} instead:
+
+\begin{verbatim}
+rule sum: NUM           {{ return NUM }}
+        | NUM "+" sum   {{ return (NUM, sum) }}
+\end{verbatim}
+
+Unfortunately, using this grammar, \texttt{1+2+3+4} would be parsed as
+\texttt{(1,(2,(3,4)))}, which no longer follows left associativity.
+The rule also needs to be left-factored. We therefore write the
+pattern as a loop instead:
+
+\begin{verbatim}
+rule sum: NUM {{ v = NUM }}
+          ( "[+]" NUM {{ v = (v,NUM) }} )*
+          {{ return v }}
+\end{verbatim}
+
+In general, replace rules of the form:
+
+\begin{verbatim}
+rule A: A a1 -> << E1 >>
+      | A a2 -> << E2 >>
+      | b3 -> << E3 >>
+      | b4 -> << E4 >>
+\end{verbatim}
+
+with rules of the form:
+
+\begin{verbatim}
+rule A: ( b3 {{ A = E3 }}
+        | b4 {{ A = E4 }} )
+        ( a1 {{ A = E1 }}
+        | a2 {{ A = E2 }} )*
+        {{ return A }}
+\end{verbatim}
+
+We have taken a rule that proved problematic with recursion and
+turned it into a rule that works well with looping constructs.
+
+\mysubsection{Ambiguous Grammars}
+\label{sec:Ambiguous-Grammars}
+
+In section \ref{sec:Left-Factoring} we saw the classic if/then/else
+ambiguity, which occurs because the ``else \ldots'' portion of an ``if
+\ldots then \ldots else \ldots'' construct is optional. Programs with
+nested if/then/else constructs can be ambiguous when one of the else
+clauses is missing:
+
+\begin{verbatim}
+if 1 then                      if 1 then
+    if 5 then                      if 5 then
+        x := 1;                        x := 1;
+    else                       else
+        y := 9;                        y := 9;
+\end{verbatim}
+
+The indentation shows that the program can be parsed in two different
+ways. (Of course, if we all adopted Python's indentation-based
+structuring, this would never happen!) Usually we want the parsing on
+the left: the ``else'' should be associated with the closest ``if''
+statement. In section \ref{sec:Left-Factoring} we ``solved'' the
+problem by using the following grammar:
+
+\begin{verbatim}
+rule stmt: "if" expr
+           "then" stmt {{ then_part = stmt }}
+           {{ else_part = [] }}
+           [ "else" stmt {{ else_part = stmt }} ]
+           {{ return ('If', expr, then_part, else_part) }}
+\end{verbatim}
+
+Here, we have an optional match of ``else'' followed by a statement.
+The ambiguity is that if an ``else'' is present, it is not clear
+whether you want it parsed immediately or if you want it to be parsed
+by the outer ``if''.
+
+Yapps will deal with the situation by matching the else pattern when
+it can. The parser will work in this case because it prefers the
+\emph{first} matching clause, which tells Yapps to parse the ``else''.
+That is exactly what we want!
+
+For ambiguity cases with choices, Yapps will choose the \emph{first}
+matching choice. However, remember that Yapps only looks at the first
+token to determine its decision, so {\tt (a b | a c)} will result in
+Yapps choosing {\tt a b} even when the input is {\tt a c}. It only
+looks at the first token, {\tt a}, to make its decision.
+
+\mysection{Customization}
+
+Both the parsers and the scanners can be customized. The parser is
+usually extended by subclassing, and the scanner can either be
+subclassed or completely replaced.
+
+\mysubsection{Customizing Parsers}
+
+If additional fields and methods are needed in order for a parser to
+work, Python subclassing can be used. (This is unlike parser classes
+written in static languages, in which these fields and methods must be
+defined in the generated parser class.) We simply subclass the
+generated parser, and add any fields or methods required. Expressions
+in the grammar can call methods of the subclass to perform any actions
+that cannot be expressed as a simple expression. For example,
+consider this simple grammar:
+
+\begin{verbatim}
+parser X:
+    rule goal: "something" {{ self.printmsg() }}
+\end{verbatim}
+
+The \texttt{printmsg} function need not be implemented in the parser
+class \texttt{X}; it can be implemented in a subclass:
+
+\begin{verbatim}
+import Xparser
+
+class MyX(Xparser.X):
+    def printmsg(self):
+        print "Hello!"
+\end{verbatim}
+
+\mysubsection{Customizing Scanners}
+
+The generated parser class is not dependent on the generated scanner
+class. A scanner object is passed to the parser object's constructor
+in the \texttt{parse} function. To use a different scanner, write
+your own function to construct parser objects, passing in an instance
+of your own scanner. Scanner objects must have a \texttt{token} method
+that accepts an integer \texttt{N} as well as a list of allowed token
+types, and returns the Nth token, as a tuple. The default scanner
+raises \texttt{NoMoreTokens} if no tokens are available, and
+\texttt{SyntaxError} if no token could be matched. However, the
+parser does not rely on these exceptions; only the \texttt{parse}
+convenience function (which calls \texttt{wrap\_error\_reporter}) and
+the \texttt{print\_error} error display function use those exceptions.
+
+The tuples representing tokens have four elements. The first two are
+the beginning and ending indices of the matched text in the input
+string. The third element is the type tag, matching either the name
+of a named token or the quoted regexp of an inline or ignored token.
+The fourth element of the token tuple is the matched text. If the
+input string is \texttt{s}, and the token tuple is
+\texttt{(b,e,type,val)}, then \texttt{val} should be equal to
+\texttt{s[b:e]}.
+
+The generated parsers do not use the beginning or ending index. They
+use only the token type and value. However, the default error
+reporter uses the beginning and ending index to show the user where
+the error is.
+
+\mysection{Parser Mechanics}
+
+The base parser class (Parser) defines two methods, \texttt{\_scan}
+and \texttt{\_peek}, and two fields, \texttt{\_pos} and
+\texttt{\_scanner}. The generated parser inherits from the base
+parser, and contains one method for each rule in the grammar. To
+avoid name clashes, do not use names that begin with an underscore
+(\texttt{\_}).
+
+\mysubsection{Parser Objects}
+\label{sec:Parser-Objects}
+
+Yapps produces as output two exception classes, a scanner class, a
+parser class, and a function \texttt{parse} that puts everything
+together. The \texttt{parse} function does not have to be used;
+instead, one can create a parser and scanner object and use them
+together for parsing.
+
+\begin{verbatim}
+    def parse(rule, text):
+        P = X(XScanner(text))
+        return wrap_error_reporter(P, rule)
+\end{verbatim}
+
+The \texttt{parse} function takes the name of a rule and an input
+string as input. It creates a scanner and parser object, then calls
+\texttt{wrap\_error\_reporter} to execute the method in the parser
+object named \texttt{rule}. The wrapper function will call the
+appropriate parser rule and report any parsing errors to standard
+output.
+
+There are several situations in which the \texttt{parse} function
+would not be useful. If a different parser or scanner is being used,
+or exceptions are to be handled differently, a new \texttt{parse}
+function would be required. The supplied \texttt{parse} function can
+be used as a template for writing a function for your own needs. An
+example of a custom parse function is the \texttt{generate} function
+in \texttt{Yapps.py}.
+
+\mysubsection{Context Sensitive Scanner}
+
+Unlike most scanners, the scanner produced by Yapps can take into
+account the context in which tokens are needed, and try to match only
+good tokens. For example, in the grammar:
+
+\begin{verbatim}
+parser IniFile:
+    token ID:  "[a-zA-Z_0-9]+"
+    token VAL: ".*"
+
+    rule pair: ID "[ \t]*=[ \t]*" VAL "\n"
+\end{verbatim}
+
+we would like to scan lines of text and pick out a name/value pair.
+In a conventional scanner, the input string \texttt{shell=progman.exe}
+would be turned into a single token of type \texttt{VAL}. The Yapps
+scanner, however, knows that at the beginning of the line, an
+\texttt{ID} is expected, so it will return \texttt{"shell"} as a token
+of type \texttt{ID}. Later, it will return \texttt{"progman.exe"} as
+a token of type \texttt{VAL}.
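+
+Following the \texttt{parse} template from section
+\ref{sec:Parser-Objects}, a driver for this grammar might look like
+the sketch below. (The sketch is not part of the distribution, and it
+assumes the generated parser was saved as a module named
+\texttt{inifile}; the module name is an invented example.)
+
+\begin{verbatim}
+from yapps import runtime
+import inifile    # module generated by Yapps from the IniFile grammar
+
+def parse_pair(text):
+    # Build the scanner and parser objects directly, just as the
+    # generated parse() convenience function does.
+    P = inifile.IniFile(inifile.IniFileScanner(text))
+    return runtime.wrap_error_reporter(P, 'pair')
+
+parse_pair('shell=progman.exe\n')
+\end{verbatim}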
+
+Context sensitivity decreases the separation between scanner and
+parser, but it is useful in parsers like \texttt{IniFile}, where the
+tokens themselves are not unambiguous, but \emph{are} unambiguous
+given a particular stage in the parsing process.
+
+Unfortunately, context sensitivity can make it more difficult to
+detect errors in the input. For example, in parsing a Pascal-like
+language with ``begin'' and ``end'' as keywords, a context sensitive
+scanner would only match ``end'' as the END token if the parser is in
+a place that will accept the END token. If not, then the scanner
+would match ``end'' as an identifier. To disable the context
+sensitive scanner in Yapps, add the
+\texttt{context-insensitive-scanner} option to the grammar:
+
+\begin{verbatim}
+parser X:
+    option: "context-insensitive-scanner"
+\end{verbatim}
+
+Context-insensitive scanning makes the parser look cleaner as well.
+
+\mysubsection{Internal Variables}
+
+There are two internal fields that may be of use. The parser object
+has two fields, \texttt{\_pos}, which is the index of the current
+token being matched, and \texttt{\_scanner}, which is the scanner
+object. The token itself can be retrieved by accessing the scanner
+object and calling the \texttt{token} method with the token index.
+However, if you call \texttt{token} before the token has been
+requested by the parser, it may mess up a context-sensitive
+scanner.\footnote{When using a context-sensitive scanner, the parser
+tells the scanner what the valid token types are at each point. If
+you call \texttt{token} before the parser can tell the scanner the
+valid token types, the scanner will attempt to match without
+considering the context.} A potentially useful combination of these
+fields is to extract the portion of the input matched by the current
+rule. To do this, save the scanner position (\texttt{\_scanner.pos})
+before the text is matched and again after the text is matched:
+
+\begin{verbatim}
+    rule R:
+        {{ start = self._scanner.pos }}
+        a b c
+        {{ end = self._scanner.pos }}
+        {{ print 'Text is', self._scanner.input[start:end] }}
+\end{verbatim}
+
+\mysubsection{Pre- and Post-Parser Code}
+
+Sometimes the parser code needs to rely on helper variables,
+functions, and classes. A Yapps grammar can optionally be surrounded
+by double percent signs, to separate the grammar from Python code.
+
+\begin{verbatim}
+... Python code ...
+%%
+... Yapps grammar ...
+%%
+... Python code ...
+\end{verbatim}
+
+The second \verb|%%| can be omitted if there is no Python code at the
+end, and the first \verb|%%| can be omitted if there is no extra
+Python code at all. (To have code only at the end, both separators
+are required.)
+
+If the second \verb|%%| is omitted, Yapps will insert testing code
+that allows you to use the generated parser to parse a file.
+
+The extended calculator example in the Yapps examples subdirectory
+includes both pre-parser and post-parser code.
+
+\mysubsection{Representation of Grammars}
+
+For each kind of pattern there is a class derived from Pattern. Yapps
+has classes for Terminal, NonTerminal, Sequence, Choice, Option, Plus,
+Star, and Eval. Each of these classes has the following interface:
+
+\begin{itemize}
+  \item[setup(\emph{gen})] Set accepts-$\epsilon$, and call
+  \emph{gen.changed()} if it changed. This function can change the
+  flag from false to true but \emph{not} from true to false.
+  \item[update(\emph{gen})] Set \first and \follow, and call
+  \emph{gen.changed()} if either changed. This function can add to
+  the sets but \emph{not} remove from them.
+  \item[output(\emph{gen}, \emph{indent})] Generate code for matching
+  this rule, using \emph{indent} as the current indentation level.
+  Writes are performed using \emph{gen.write}.
+  \item[used(\emph{vars})] Given a list of variables \emph{vars},
+  return two lists: one containing the variables that are used, and
+  one containing the variables that are assigned. This function is
+  used for optimizing the resulting code.
+\end{itemize}
+
+Both \emph{setup} and \emph{update} monotonically increase the
+variables they modify. Since the variables can only increase a finite
+number of times, we can repeatedly call the function until the
+variables stabilize. The \emph{used} function is not currently
+implemented.
+
+With each pattern in the grammar Yapps associates three pieces of
+information: the \first set, the \follow set, and the
+accepts-$\epsilon$ flag.
+
+The \first set contains the tokens that can appear as we start
+matching the pattern. The \follow set contains the tokens that can
+appear immediately after we match the pattern. The accepts-$\epsilon$
+flag is true if the pattern can match no tokens. In this case, \first
+will contain all the elements in \follow. The \follow set is not
+needed when accepts-$\epsilon$ is false, and may not be accurate in
+those cases.
+
+Yapps does not compute these sets precisely. Its approximation can
+miss certain cases, such as this one:
+
+\begin{verbatim}
+    rule C: ( A* | B )
+    rule B: C [A]
+\end{verbatim}
+
+Yapps will calculate {\tt C}'s \follow set to include {\tt A}.
+However, {\tt C} will always match all the {\tt A}'s, so {\tt A} will
+never follow it. Yapps 2.0 does not properly handle this construct,
+but if it seems important, I may add support for it in a future
+version.
+
+Yapps also cannot handle constructs that depend on the calling
+sequence. For example:
+
+\begin{verbatim}
+    rule R: U | 'b'
+    rule S: | 'c'
+    rule T: S 'b'
+    rule U: S 'a'
+\end{verbatim}
+
+The \follow set for {\tt S} includes {\tt a} and {\tt b}. Since {\tt
+  S} can be empty, the \first set for {\tt S} should include {\tt a},
+{\tt b}, and {\tt c}. However, when parsing {\tt R}, if the lookahead
+is {\tt b} we should \emph{not} parse {\tt U}. That's because in {\tt
+  U}, {\tt S} is followed by {\tt a} and not {\tt b}. Therefore in
+{\tt R}, we should choose rule {\tt U} only if there is an {\tt a} or
+{\tt c}, but not if there is a {\tt b}. Yapps and many other LL(1)
+systems do not distinguish {\tt S b} and {\tt S a}, making {\tt
+  S}'s \follow set {\tt a, b}, and making {\tt R} always try to match
+{\tt U}. In this case we can solve the problem by changing {\tt R} to
+\verb:'b' | U:, but it may not always be possible to solve all such
+problems in this way.
+
+\appendix
+
+\mysection{Grammar for Parsers}
+
+This is the grammar for parsers, without any Python code mixed in.
+The complete grammar can be found in \texttt{parsedesc.g} in the Yapps
+distribution.
+
+\begin{verbatim}
+parser ParserDescription:
+    ignore:      "\\s+"
+    ignore:      "#.*?\r?\n"
+    token END:   "$"  # $ means end of string
+    token ATTR:  "<<.+?>>"
+    token STMT:  "{{.+?}}"
+    token ID:    '[a-zA-Z_][a-zA-Z_0-9]*'
+    token STR:   '[rR]?\'([^\\n\'\\\\]|\\\\.)*\'|[rR]?"([^\\n"\\\\]|\\\\.)*"'
+
+    rule Parser: "parser" ID ":"
+                     Options
+                     Tokens
+                     Rules
+                 END
+
+    rule Options: ( "option" ":" STR )*
+    rule Tokens:  ( "token" ID ":" STR | "ignore" ":" STR )*
+    rule Rules:   ( "rule" ID OptParam ":" ClauseA )*
+
+    rule ClauseA: ClauseB ( '[|]' ClauseB )*
+    rule ClauseB: ClauseC*
+    rule ClauseC: ClauseD [ '[+]' | '[*]' ]
+    rule ClauseD: STR | ID [ATTR] | STMT
+                | '\\(' ClauseA '\\)' | '\\[' ClauseA '\\]'
+\end{verbatim}
+
+\mysection{Upgrading}
+
+Yapps 2.0 is not backwards compatible with Yapps 1.0. In this section
+are some tips for upgrading:
+
+\begin{enumerate}
+  \item Yapps 1.0 was distributed as a single file. Yapps 2.0 is
+  instead distributed as two Python files: a \emph{parser generator}
+  (26k) and a \emph{parser runtime} (5k). You need both files to
+  create parsers, but you need only the runtime (\texttt{yappsrt.py})
+  to use the parsers.
+
+  \item Yapps 1.0 supported Python 1.4 regular expressions from the
+  \texttt{regex} module. Yapps 2.0 uses Python 1.5 regular
+  expressions from the \texttt{re} module. \emph{The new syntax for
+  regular expressions is not compatible with the old syntax.}
+  Andrew Kuchling has a \htmladdnormallink{guide to converting
+  regular
+  expressions}{http://www.python.org/doc/howto/regex-to-re/} on his
+  web page.
+
+  \item Yapps 1.0 wants a pattern and then a return value in \verb|->|
+  \verb|<<...>>|. Yapps 2.0 allows patterns and Python statements to
+  be mixed. To convert a rule like this:
+
+\begin{verbatim}
+rule R: A B C -> << E1 >>
+      | X Y Z -> << E2 >>
+\end{verbatim}
+
+  to Yapps 2.0 form, replace the return value specifiers with return
+  statements:
+
+\begin{verbatim}
+rule R: A B C {{ return E1 }}
+      | X Y Z {{ return E2 }}
+\end{verbatim}
+
+  \item Yapps 2.0 does not perform tail recursion elimination. This
+  means any recursive rules you write will be turned into recursive
+  methods in the parser. The parser will work, but may be slower.
+  It can be made faster by rewriting recursive rules, using instead
+  the looping operators \verb|*| and \verb|+| provided in Yapps 2.0.
+
+\end{enumerate}
+
+\mysection{Troubleshooting}
+
+\begin{itemize}
+  \item A common error is to write a grammar that doesn't have an END
+  token. End tokens are needed when it is not clear when to stop
+  parsing. For example, when parsing the expression {\tt 3+5}, it is
+  not clear after reading {\tt 3} whether to treat it as a complete
+  expression or whether the parser should continue reading.
+  Therefore the grammar for numeric expressions should include an end
+  token. Another example is the grammar for Lisp expressions. In
+  Lisp, it is always clear when you should stop parsing, so you do
+  \emph{not} need an end token. In fact, it may be more useful not
+  to have an end token, so that you can read in several Lisp expressions.
+  \item If there is a chance of ambiguity, make sure to put the choices
+  in the order you want them checked. Usually the most specific
+  choice should be first. Empty sequences should usually be last.
+  \item The context sensitive scanner is not appropriate for all
+  grammars. You might try the context-insensitive scanner instead, by
+  adding the {\tt context-insensitive-scanner} option to the grammar.
+  \item If performance turns out to be a problem, try writing a custom
+  scanner. The Yapps scanner is rather slow (but flexible and easy
+  to understand).
+\end{itemize}
+
+\mysection{History}
+
+Yapps 1 had several limitations that bothered me while writing
+parsers:
+
+\begin{enumerate}
+  \item It was not possible to insert statements into the generated
+  parser. A common workaround was to write an auxiliary function
+  that executed those statements, and to call that function as part
+  of the return value calculation. For example, several of my
+  parsers had an ``append(x,y)'' function that existed solely to call
+  ``x.append(y)''.
+  \item The way in which grammars were specified was rather
+  restrictive: a rule was a choice of clauses. Each clause was a
+  sequence of tokens and rule names, followed by a return value.
+  \item Optional matching had to be put into a separate rule because
+  choices were only made at the beginning of a rule.
+  \item Repetition had to be specified in terms of recursion. Not only
+  was this awkward (sometimes requiring additional rules), I had to
+  add a tail recursion optimization to Yapps to transform the
+  recursion back into a loop.
+\end{enumerate}
+
+Yapps 2 addresses each of these limitations.
+
+\begin{enumerate}
+  \item Statements can occur anywhere within a rule. (However, only
+  one-line statements are allowed; multiline blocks marked by
+  indentation are not.)
+  \item Grammars can be specified using any mix of sequences, choices,
+  tokens, and rule names. To allow for complex structures,
+  parentheses can be used for grouping.
+  \item Given choices and parenthesization, optional matching can be
+  expressed as a choice between some pattern and nothing. In
+  addition, Yapps 2 has the convenience syntax \verb|[A B ...]| for
+  matching \verb|A B ...| optionally.
+  \item Repetition operators \verb|*| for zero or more and \verb|+| for
+  one or more make it easy to specify repeating patterns.
+\end{enumerate}
+
+It is my hope that Yapps 2 will be flexible enough to meet my needs
+for another year, yet simple enough that I do not hesitate to use it.
+
+\mysection{Debian Extensions}
+\label{sec:debian}
+
+The Debian version adds the following enhancements to the original
+Yapps code. They were written by Matthias Urlichs.
+
+\begin{enumerate}
+  \item Yapps can stack input sources ("include files"). A usage example
+  is supplied with the calc.g sample program.
+  \item Yapps now understands augmented ignore-able patterns.
+  This means that Yapps can parse multi-line C comments; this wasn't
+  possible before.
+  \item Better error reporting.
+  \item Yapps now reads its input incrementally.
+\end{enumerate}
+
+The parser runtime has been renamed to \texttt{yapps/runtime.py}.
+In Debian, this file is provided by the \texttt{yapps2-runtime} package.
+You need to depend on it if you Debianize Python programs that use
+yapps.
+
+\mysection{Future Extensions}
+\label{sec:future}
+
+I am still investigating the possibility of LL(2) and higher
+lookahead. However, it looks like the resulting parsers will be
+somewhat ugly.
+
+It would be nice to control choices with user-defined predicates.
+
+The most likely future extension is backtracking. A grammar pattern
+like \verb|(VAR ':=' expr)? {{ return Assign(VAR,expr) }} : expr {{ return expr }}|
+would turn into code that attempted to match \verb|VAR ':=' expr|. If
+it succeeded, it would run \verb|{{ return ... }}|. If it failed, it
+would match \verb|expr {{ return expr }}|. Backtracking may make it
+less necessary to write LL(2) grammars.
+
+\mysection{References}
+
+\begin{enumerate}
+  \item The \htmladdnormallink{Python-Parser
+  SIG}{http://www.python.org/sigs/parser-sig/} is the first place
+  to look for a list of parser systems for Python.
+
+  \item ANTLR/PCCTS, by Terrence Parr, is available at
+  \htmladdnormallink{The ANTLR Home Page}{http://www.antlr.org/}.
+
+  \item PyLR, by Scott Cotton, is at \htmladdnormallink{his Starship
+  page}{http://starship.skyport.net/crew/scott/PyLR.html}.
+
+  \item John Aycock's \htmladdnormallink{Compiling Little Languages
+  Framework}{http://www.foretec.com/python/workshops/1998-11/proceedings/papers/aycock-little/aycock-little.html}.
+
+  \item PyBison, by Scott Hassan, can be found at
+  \htmladdnormallink{his Python Projects
+  page}{http://coho.stanford.edu/\~{}hassan/Python/}.
+
+  \item mcf.pars, by Mike C. Fletcher, is available at
+  \htmladdnormallink{his web
+  page}{http://members.rogers.com/mcfletch/programming/simpleparse/simpleparse.html}.
+
+  \item kwParsing, by Aaron Watters, is available at
+  \htmladdnormallink{his Starship
+  page}{http://starship.skyport.net/crew/aaron_watters/kwParsing/}.
+\end{enumerate}
+
+\end{document}
diff --git a/examples/calc.g b/examples/calc.g
new file mode 100644
index 0000000..5432855
--- /dev/null
+++ b/examples/calc.g
@@ -0,0 +1,64 @@
+globalvars = {}       # We will store the calculator's variables here
+
+def lookup(map, name):
+    for x,v in map:
+        if x == name: return v
+    if not globalvars.has_key(name): print 'Undefined (defaulting to 0):', name
+    return globalvars.get(name, 0)
+
+def stack_input(scanner,ign):
+    """Grab more input"""
+    scanner.stack_input(raw_input(">?> "))
+
+%%
+parser Calculator:
+    ignore:    "[ \r\t\n]+"
+    ignore:    "[?]"   {{ stack_input }}
+
+    token END: "$"
+    token NUM: "[0-9]+"
+    token VAR: "[a-zA-Z_]+"
+
+    # Each line can either be an expression or an assignment statement
+    rule goal: expr<<[]>> END        {{ print '=', expr }}
+                                     {{ return expr }}
+       | "set" VAR expr<<[]>> END    {{ globalvars[VAR] = expr }}
+                                     {{ print VAR, '=', expr }}
+                                     {{ return expr }}
+
+    # An expression is the sum and difference of factors
+    rule expr<<V>>: factor<<V>>         {{ n = factor }}
+                  ( "[+]" factor<<V>>   {{ n = n+factor }}
+                  | "-"   factor<<V>>   {{ n = n-factor }}
+                  )*                    {{ return n }}
+
+    # A factor is the product and division of terms
+    rule factor<<V>>: term<<V>>         {{ v = term }}
+                  ( "[*]" term<<V>>     {{ v = v*term }}
+                  | "/"   term<<V>>     {{ v = v/term }}
+                  )*                    {{ return v }}
+
+    # A term is a number, variable, or an expression surrounded by parentheses
+    rule term<<V>>:
+          NUM                        {{ return int(NUM) }}
+        | VAR                        {{ return lookup(V, VAR) }}
+        | "\\(" expr<<V>> "\\)"      {{ return expr }}
+        | "let" VAR "=" expr<<V>>    {{ V = [(VAR, expr)] + V }}
+          "in" expr<<V>>             {{ return expr }}
+%%
+if __name__=='__main__':
+    print 'Welcome to the calculator sample for Yapps 2.'
+    print '  Enter either "<expression>" or "set <var> <expression>",'
+    print '  or just press return to exit.  An expression can have'
+    print '  local variables:  let x = expr in expr'
+    # We could have put this loop into the parser, by making the
+    # `goal' rule use (expr | set var expr)*, but by putting the
+    # loop into Python code, we can make it interactive (i.e., enter
+    # one expression, get the result, enter another expression, etc.)
+    while 1:
+        try: s = raw_input('>>> ')
+        except EOFError: break
+        if not s.strip(): break
+        parse('goal', s)
+    print 'Bye.'
+ diff --git a/examples/expr.g b/examples/expr.g new file mode 100644 index 0000000..ae807b7 --- /dev/null +++ b/examples/expr.g @@ -0,0 +1,21 @@ +parser Calculator: + token END: "$" + token NUM: "[0-9]+" + + rule goal: expr END {{ return expr }} + + # An expression is the sum and difference of factors + rule expr: factor {{ v = factor }} + ( "[+]" factor {{ v = v+factor }} + | "-" factor {{ v = v-factor }} + )* {{ return v }} + + # A factor is the product and division of terms + rule factor: term {{ v = term }} + ( "[*]" term {{ v = v*term }} + | "/" term {{ v = v/term }} + )* {{ return v }} + + # A term is either a number or an expression surrounded by parentheses + rule term: NUM {{ return atoi(NUM) }} + | "\\(" expr "\\)" {{ return expr }} diff --git a/examples/lisp.g b/examples/lisp.g new file mode 100644 index 0000000..551e6f9 --- /dev/null +++ b/examples/lisp.g @@ -0,0 +1,13 @@ +parser Lisp: + ignore: r'\s+' + token NUM: r'[0-9]+' + token ID: r'[-+*/!@$%^&=.a-zA-Z0-9_]+' + token STR: r'"([^\\"]+|\\.)*"' + + rule expr: ID {{ return ('id',ID) }} + | STR {{ return ('str',eval(STR)) }} + | NUM {{ return ('num',int(NUM)) }} + | r"\(" + {{ e = [] }} # initialize the list + ( expr {{ e.append(expr) }} ) * # put each expr into the list + r"\)" {{ return e }} # return the list diff --git a/examples/notes b/examples/notes new file mode 100644 index 0000000..aa45bfb --- /dev/null +++ b/examples/notes @@ -0,0 +1,44 @@ +Hints +##### + +Some additional hints for your edification. + +Author: Matthias Urlichs <smurf@debian.org> + +How to process C preprocessor codes: +==================================== + +Rudimentary include handling has been added to the parser by me. + +However, if you want to do anything fancy, like for instance whatever +the C preprocessor does, things get more complicated. Fortunately, +there's already a nice tool to handle C preprocessing -- CPP itself. + +If you want to report errors correctly in that situation, do this: + + def set_line(s,m): + """Fixup the scanner's idea of the current line""" + s.filename = m.group(2) + line = int(m.group(1)) + s.del_line = line - s.line + + %% + parser whatever: + ignore: '^#\s*(\d+)\s*"([^"\n]+)"\s*\n' {{ set_line }} + ignore: '^#.*\n' + + [...] + %% + if __name__=='__main__': + import sys,os + for a in sys.argv[1:]: + f=os.popen("cpp "+repr(a),"r") + + P = whatever(whateverScanner("", filename=a, file=f)) + try: P.goal() + except runtime.SyntaxError, e: + runtime.print_error(e, P._scanner) + sys.exit(1) + + f.close() + diff --git a/examples/xml.g b/examples/xml.g new file mode 100644 index 0000000..cec2709 --- /dev/null +++ b/examples/xml.g @@ -0,0 +1,66 @@ +#!/usr/bin/python2 + +# xml.g +# +# Amit J. Patel, August 2003 +# +# Simple (non-conforming, non-validating) parsing of XML documents, +# based on Robert D. Cameron's "REX" shallow parser. It doesn't +# handle CDATA and lots of other stuff; it's meant to demonstrate +# Yapps, not replace a proper XML parser. + +%% + +parser xml: + token nodetext: r'[^<>]+' + token attrtext_singlequote: "[^']*" + token attrtext_doublequote: '[^"]*' + token SP: r'\s' + token id: r'[a-zA-Z_:][a-zA-Z0-9_:.-]*' + + rule node: + r'<!--.*?-->' {{ return ['!--comment'] }} + | r'<!\[CDATA\[.*?\]\]>' {{ return ['![CDATA['] }} + | r'<!' SP* id '[^>]*>' {{ return ['!doctype'] }} + | '<' SP* id SP* attributes SP* {{ startid = id }} + ( '>' nodes '</' SP* id SP* '>' {{ assert startid == id, 'Mismatched tags <%s> ... 
</%s>' % (startid, id) }} + {{ return [id, attributes] + nodes }} + | '/\s*>' {{ return [id, attributes] }} + ) + | nodetext {{ return nodetext }} + + rule nodes: {{ result = [] }} + ( node {{ result.append(node) }} + ) * {{ return result }} + + rule attribute: id SP* '=' SP* + ( '"' attrtext_doublequote '"' {{ return (id, attrtext_doublequote) }} + | "'" attrtext_singlequote "'" {{ return (id, attrtext_singlequote) }} + ) + + rule attributes: {{ result = {} }} + ( attribute SP* {{ result[attribute[0]] = attribute[1] }} + ) * {{ return result }} + +%% + +if __name__ == '__main__': + tests = ['<!-- hello -->', + 'some text', + '< bad xml', + '<br />', + '< spacey a = "foo" / >', + '<a href="foo">text ... </a>', + '<begin> middle </end>', + '<begin> <nested attr=\'baz\' another="hey"> foo </nested> <nested> bar </nested> </begin>', + ] + print + print '____Running tests_______________________________________' + for test in tests: + print + try: + parser = xml(xmlScanner(test)) + output = '%s ==> %s' % (repr(test), repr(parser.node())) + except (yappsrt.SyntaxError, AssertionError), e: + output = '%s ==> FAILED ==> %s' % (repr(test), e) + print output diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..0d7c98d --- /dev/null +++ b/setup.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python + +"""Setup script for 'yapps'""" + +from distutils.core import setup + +description = "Yet Another Python Parser System" +long_description = \ +""" +YAPPS is an easy to use parser generator that is written in Python and +generates Python code. There are several parser generator systems +already available for Python, but this parser has different goals: +Yapps is simple, very easy to use, and produces human-readable parsers. + +It is not the fastest or most powerful parser. Yapps is designed to be +used when regular expressions are not enough and other parser systems +are too much: situations where you might otherwise write your own +recursive descent parser. + +This package contains several upward-compatible enhancements to the +original YAPPS source: +- Handle stacked input ("include files") +- augmented ignore-able patterns (can parse multi-line C comments correctly) +- better error reporting +- read input incrementally +""" + +setup (name = "python-yapps", + version = "2.1.1", + description = description, + long_description = long_description, + author = "Amit J. Patel", + author_email = "amitp@cs.stanford.edu", + maintainer = "Matthias Urlichs", + maintainer_email = "smurf@debian.org", + url = "http://theory.stanford.edu/~amitp/yapps/", + license = 'MIT', + platforms = ['POSIX'], + keywords = ['parsing'], + packages = ['yapps'], + #cmdclass = {'bdist_rpm': MyBDist_RPM}, + ) diff --git a/test/empty_clauses.g b/test/empty_clauses.g new file mode 100644 index 0000000..3b435f2 --- /dev/null +++ b/test/empty_clauses.g @@ -0,0 +1,10 @@ +# This parser tests the use of OR clauses with one of them being empty +# +# The output of --dump should indicate the FOLLOW set for (A | ) is 'c'. + +parser Test: + rule TestPlus: ( A | ) 'c' + rule A: 'a'+ + + rule TestStar: ( B | ) 'c' + rule B: 'b'*
\ No newline at end of file diff --git a/test/line_numbers.g b/test/line_numbers.g new file mode 100644 index 0000000..19eee9a --- /dev/null +++ b/test/line_numbers.g @@ -0,0 +1,10 @@ +# +# The error messages produced by Yapps have a line number. +# The line number should take the Python code section into account. + +# The line number should be 10. + +%% + +parser error_1: + this_is_an_error; diff --git a/test/option.g b/test/option.g new file mode 100644 index 0000000..5980362 --- /dev/null +++ b/test/option.g @@ -0,0 +1,17 @@ + +%% + +parser test_option: + ignore: r'\s+' + token a: 'a' + token b: 'b' + token EOF: r'$' + + rule test_brackets: a [b] EOF + + rule test_question_mark: a b? EOF + +%% + +# The generated code for test_brackets and test_question_mark should +# be the same. diff --git a/yapps/__init__.py b/yapps/__init__.py new file mode 100644 index 0000000..1bb8bf6 --- /dev/null +++ b/yapps/__init__.py @@ -0,0 +1 @@ +# empty diff --git a/yapps/grammar.py b/yapps/grammar.py new file mode 100644 index 0000000..1714344 --- /dev/null +++ b/yapps/grammar.py @@ -0,0 +1,211 @@ +# grammar.py, part of Yapps 2 - yet another python parser system +# Copyright 1999-2003 by Amit J. Patel <amitp@cs.stanford.edu> +# +# This version of the Yapps 2 grammar can be distributed under the +# terms of the MIT open source license, either found in the LICENSE +# file included with the Yapps distribution +# <http://theory.stanford.edu/~amitp/yapps/> or at +# <http://www.opensource.org/licenses/mit-license.php> +# + +"""Parser for Yapps grammars. + +This file defines the grammar of Yapps grammars. Naturally, it is +implemented in Yapps. The grammar.py module needed by Yapps is built +by running Yapps on yapps_grammar.g. (Holy circularity, Batman!) + +""" + +import sys, re +from yapps import parsetree + +###################################################################### +def cleanup_choice(rule, lst): + if len(lst) == 0: return Sequence(rule, []) + if len(lst) == 1: return lst[0] + return parsetree.Choice(rule, *tuple(lst)) + +def cleanup_sequence(rule, lst): + if len(lst) == 1: return lst[0] + return parsetree.Sequence(rule, *tuple(lst)) + +def resolve_name(rule, tokens, id, args): + if id in [x[0] for x in tokens]: + # It's a token + if args: + print 'Warning: ignoring parameters on TOKEN %s<<%s>>' % (id, args) + return parsetree.Terminal(rule, id) + else: + # It's a name, so assume it's a nonterminal + return parsetree.NonTerminal(rule, id, args) + + +# Begin -- grammar generated by Yapps +import sys, re +from yapps import runtime + +class ParserDescriptionScanner(runtime.Scanner): + patterns = [ + ('"rule"', re.compile('rule')), + ('"ignore"', re.compile('ignore')), + ('"token"', re.compile('token')), + ('"option"', re.compile('option')), + ('":"', re.compile(':')), + ('"parser"', re.compile('parser')), + ('[ \t\r\n]+', re.compile('[ \t\r\n]+')), + ('#.*?\r?\n', re.compile('#.*?\r?\n')), + ('EOF', re.compile('$')), + ('ATTR', re.compile('<<.+?>>')), + ('STMT', re.compile('{{.+?}}')), + ('ID', re.compile('[a-zA-Z_][a-zA-Z_0-9]*')), + ('STR', re.compile('[rR]?\'([^\\n\'\\\\]|\\\\.)*\'|[rR]?"([^\\n"\\\\]|\\\\.)*"')), + ('LP', re.compile('\\(')), + ('RP', re.compile('\\)')), + ('LB', re.compile('\\[')), + ('RB', re.compile('\\]')), + ('OR', re.compile('[|]')), + ('STAR', re.compile('[*]')), + ('PLUS', re.compile('[+]')), + ('QUEST', re.compile('[?]')), + ('COLON', re.compile(':')), + ] + def __init__(self, str,*args,**kw): + runtime.Scanner.__init__(self,None,{'[ 
\t\r\n]+':None,'#.*?\r?\n':None,},str,*args,**kw) + +class ParserDescription(runtime.Parser): + Context = runtime.Context + def Parser(self, _parent=None): + _context = self.Context(_parent, self._scanner, 'Parser', []) + self._scan('"parser"', context=_context) + ID = self._scan('ID', context=_context) + self._scan('":"', context=_context) + Options = self.Options(_context) + Tokens = self.Tokens(_context) + Rules = self.Rules(Tokens, _context) + EOF = self._scan('EOF', context=_context) + return parsetree.Generator(ID,Options,Tokens,Rules) + + def Options(self, _parent=None): + _context = self.Context(_parent, self._scanner, 'Options', []) + opt = {} + while self._peek('"option"', '"token"', '"ignore"', 'EOF', '"rule"', context=_context) == '"option"': + self._scan('"option"', context=_context) + self._scan('":"', context=_context) + Str = self.Str(_context) + opt[Str] = 1 + return opt + + def Tokens(self, _parent=None): + _context = self.Context(_parent, self._scanner, 'Tokens', []) + tok = [] + while self._peek('"token"', '"ignore"', 'EOF', '"rule"', context=_context) in ['"token"', '"ignore"']: + _token = self._peek('"token"', '"ignore"', context=_context) + if _token == '"token"': + self._scan('"token"', context=_context) + ID = self._scan('ID', context=_context) + self._scan('":"', context=_context) + Str = self.Str(_context) + tok.append( (ID,Str) ) + else: # == '"ignore"' + self._scan('"ignore"', context=_context) + self._scan('":"', context=_context) + Str = self.Str(_context) + ign = ('#ignore',Str) + if self._peek('STMT', '"token"', '"ignore"', 'EOF', '"rule"', context=_context) == 'STMT': + STMT = self._scan('STMT', context=_context) + ign = ign + (STMT[2:-2],) + tok.append( ign ) + return tok + + def Rules(self, tokens, _parent=None): + _context = self.Context(_parent, self._scanner, 'Rules', [tokens]) + rul = [] + while self._peek('"rule"', 'EOF', context=_context) == '"rule"': + self._scan('"rule"', context=_context) + ID = self._scan('ID', context=_context) + OptParam = self.OptParam(_context) + self._scan('":"', context=_context) + ClauseA = self.ClauseA(ID, tokens, _context) + rul.append( (ID, OptParam, ClauseA) ) + return rul + + def ClauseA(self, rule, tokens, _parent=None): + _context = self.Context(_parent, self._scanner, 'ClauseA', [rule, tokens]) + ClauseB = self.ClauseB(rule,tokens, _context) + v = [ClauseB] + while self._peek('OR', 'RP', 'RB', '"rule"', 'EOF', context=_context) == 'OR': + OR = self._scan('OR', context=_context) + ClauseB = self.ClauseB(rule,tokens, _context) + v.append(ClauseB) + return cleanup_choice(rule,v) + + def ClauseB(self, rule,tokens, _parent=None): + _context = self.Context(_parent, self._scanner, 'ClauseB', [rule,tokens]) + v = [] + while self._peek('STR', 'ID', 'LP', 'LB', 'STMT', 'OR', 'RP', 'RB', '"rule"', 'EOF', context=_context) in ['STR', 'ID', 'LP', 'LB', 'STMT']: + ClauseC = self.ClauseC(rule,tokens, _context) + v.append(ClauseC) + return cleanup_sequence(rule, v) + + def ClauseC(self, rule,tokens, _parent=None): + _context = self.Context(_parent, self._scanner, 'ClauseC', [rule,tokens]) + ClauseD = self.ClauseD(rule,tokens, _context) + _token = self._peek('PLUS', 'STAR', 'QUEST', 'STR', 'ID', 'LP', 'LB', 'STMT', 'OR', 'RP', 'RB', '"rule"', 'EOF', context=_context) + if _token == 'PLUS': + PLUS = self._scan('PLUS', context=_context) + return parsetree.Plus(rule, ClauseD) + elif _token == 'STAR': + STAR = self._scan('STAR', context=_context) + return parsetree.Star(rule, ClauseD) + elif _token == 'QUEST': + QUEST = 
self._scan('QUEST', context=_context) + return parsetree.Option(rule, ClauseD) + else: + return ClauseD + + def ClauseD(self, rule,tokens, _parent=None): + _context = self.Context(_parent, self._scanner, 'ClauseD', [rule,tokens]) + _token = self._peek('STR', 'ID', 'LP', 'LB', 'STMT', context=_context) + if _token == 'STR': + STR = self._scan('STR', context=_context) + t = (STR, eval(STR,{},{})) + if t not in tokens: tokens.insert( 0, t ) + return parsetree.Terminal(rule, STR) + elif _token == 'ID': + ID = self._scan('ID', context=_context) + OptParam = self.OptParam(_context) + return resolve_name(rule,tokens, ID, OptParam) + elif _token == 'LP': + LP = self._scan('LP', context=_context) + ClauseA = self.ClauseA(rule,tokens, _context) + RP = self._scan('RP', context=_context) + return ClauseA + elif _token == 'LB': + LB = self._scan('LB', context=_context) + ClauseA = self.ClauseA(rule,tokens, _context) + RB = self._scan('RB', context=_context) + return parsetree.Option(rule, ClauseA) + else: # == 'STMT' + STMT = self._scan('STMT', context=_context) + return parsetree.Eval(rule, STMT[2:-2]) + + def OptParam(self, _parent=None): + _context = self.Context(_parent, self._scanner, 'OptParam', []) + if self._peek('ATTR', '":"', 'PLUS', 'STAR', 'QUEST', 'STR', 'ID', 'LP', 'LB', 'STMT', 'OR', 'RP', 'RB', '"rule"', 'EOF', context=_context) == 'ATTR': + ATTR = self._scan('ATTR', context=_context) + return ATTR[2:-2] + return '' + + def Str(self, _parent=None): + _context = self.Context(_parent, self._scanner, 'Str', []) + STR = self._scan('STR', context=_context) + return eval(STR,{},{}) + + +def parse(rule, text): + P = ParserDescription(ParserDescriptionScanner(text)) + return runtime.wrap_error_reporter(P, rule) + +# End -- grammar generated by Yapps + + diff --git a/yapps/parsetree.py b/yapps/parsetree.py new file mode 100644 index 0000000..e5e0ae0 --- /dev/null +++ b/yapps/parsetree.py @@ -0,0 +1,673 @@ +# parsetree.py, part of Yapps 2 - yet another python parser system +# Copyright 1999-2003 by Amit J. Patel <amitp@cs.stanford.edu> +# +# This version of the Yapps 2 Runtime can be distributed under the +# terms of the MIT open source license, either found in the LICENSE file +# included with the Yapps distribution +# <http://theory.stanford.edu/~amitp/yapps/> or at +# <http://www.opensource.org/licenses/mit-license.php> +# + +"""Classes used to represent parse trees and generate output. + +This module defines the Generator class, which drives the generation +of Python output from a grammar parse tree. It also defines nodes +used to represent the parse tree; they are derived from class Node. + +The main logic of Yapps is in this module. +""" + +import sys, re + +###################################################################### +INDENT = ' '*4 +class Generator: + + # TODO: many of the methods here should be class methods, not instance methods + + def __init__(self, name, options, tokens, rules): + self.change_count = 0 + self.name = name + self.options = options + self.preparser = '' + self.postparser = None + + self.tokens = {} # Map from tokens to regexps + self.ignore = {} # List of token names to ignore in parsing, map to statements + self.terminals = [] # List of token names (to maintain ordering) + for t in tokens: + if len(t) == 3: + n,t,s = t + else: + n,t = t + s = None + + if n == '#ignore': + n = t + self.ignore[n] = s + if n in self.tokens.keys() and self.tokens[n] != t: + print >>sys.stderr, 'Warning: token %s defined more than once.' 
% n + self.tokens[n] = t + self.terminals.append(n) + + self.rules = {} # Map from rule names to parser nodes + self.params = {} # Map from rule names to parameters + self.goals = [] # List of rule names (to maintain ordering) + for n,p,r in rules: + self.params[n] = p + self.rules[n] = r + self.goals.append(n) + + self.output = sys.stdout + + def has_option(self, name): + return self.options.get(name, 0) + + def non_ignored_tokens(self): + return [x for x in self.terminals if x not in self.ignore] + + def changed(self): + """Increments the change count. + + >>> t = Generator('', [], [], []) + >>> old_count = t.change_count + >>> t.changed() + >>> assert t.change_count == old_count + 1 + """ + self.change_count = 1+self.change_count + + def set_subtract(self, a, b): + """Returns the elements of a that are not in b. + + >>> t = Generator('', [], [], []) + >>> t.set_subtract([], []) + [] + >>> t.set_subtract([1, 2], [1, 2]) + [] + >>> t.set_subtract([1, 2, 3], [2]) + [1, 3] + >>> t.set_subtract([1], [2, 3, 4]) + [1] + """ + result = [] + for x in a: + if x not in b: + result.append(x) + return result + + def subset(self, a, b): + """True iff all elements of sequence a are inside sequence b + + >>> t = Generator('', [], [], []) + >>> t.subset([], [1, 2, 3]) + 1 + >>> t.subset([1, 2, 3], []) + 0 + >>> t.subset([1], [1, 2, 3]) + 1 + >>> t.subset([3, 2, 1], [1, 2, 3]) + 1 + >>> t.subset([1, 1, 1], [1, 2, 3]) + 1 + >>> t.subset([1, 2, 3], [1, 1, 1]) + 0 + """ + for x in a: + if x not in b: + return 0 + return 1 + + def equal_set(self, a, b): + """True iff subset(a, b) and subset(b, a) + + >>> t = Generator('', [], [], []) + >>> a_set = [1, 2, 3] + >>> t.equal_set(a_set, a_set) + 1 + >>> t.equal_set(a_set, a_set[:]) + 1 + >>> t.equal_set([], a_set) + 0 + >>> t.equal_set([1, 2, 3], [3, 2, 1]) + 1 + """ + if len(a) != len(b): return 0 + if a == b: return 1 + return self.subset(a, b) and self.subset(b, a) + + def add_to(self, parent, additions): + "Modify _parent_ to include all elements in _additions_" + for x in additions: + if x not in parent: + parent.append(x) + self.changed() + + def equate(self, a, b): + """Extend (a) and (b) so that they contain each others' elements. + + >>> t = Generator('', [], [], []) + >>> a = [1, 2] + >>> b = [2, 3] + >>> t.equate(a, b) + >>> a + [1, 2, 3] + >>> b + [2, 3, 1] + """ + self.add_to(a, b) + self.add_to(b, a) + + def write(self, *args): + for a in args: + self.output.write(a) + + def in_test(self, expr, full, set): + """Generate a test of (expr) being in (set), where (set) is a subset of (full) + + expr is a string (Python expression) + set is a list of values (which will be converted with repr) + full is the list of all values expr could possibly evaluate to + + >>> t = Generator('', [], [], []) + >>> t.in_test('x', [1,2,3,4], []) + '0' + >>> t.in_test('x', [1,2,3,4], [1,2,3,4]) + '1' + >>> t.in_test('x', [1,2,3,4], [1]) + 'x == 1' + >>> t.in_test('a+b', [1,2,3,4], [1,2]) + 'a+b in [1, 2]' + >>> t.in_test('x', [1,2,3,4,5], [1,2,3]) + 'x not in [4, 5]' + >>> t.in_test('x', [1,2,3,4,5], [1,2,3,4]) + 'x != 5' + """ + + if not set: return '0' + if len(set) == 1: return '%s == %s' % (expr, repr(set[0])) + if full and len(set) > len(full)/2: + # Reverse the sense of the test. 
+ not_set = [x for x in full if x not in set] + return self.not_in_test(expr, full, not_set) + return '%s in %s' % (expr, repr(set)) + + def not_in_test(self, expr, full, set): + """Like in_test, but the reverse test.""" + if not set: return '1' + if len(set) == 1: return '%s != %s' % (expr, repr(set[0])) + return '%s not in %s' % (expr, repr(set)) + + def peek_call(self, a): + """Generate a call to scan for a token in the set 'a'""" + assert type(a) == type([]) + a_set = (repr(a)[1:-1]) + if self.equal_set(a, self.non_ignored_tokens()): a_set = '' + if self.has_option('context-insensitive-scanner'): a_set = '' + if a_set: a_set += "," + + return 'self._peek(%s context=_context)' % a_set + + def peek_test(self, a, b): + """Generate a call to test whether the next token (which could be any of + the elements in a) is in the set b.""" + if self.subset(a, b): return '1' + if self.has_option('context-insensitive-scanner'): a = self.non_ignored_tokens() + return self.in_test(self.peek_call(a), a, b) + + def not_peek_test(self, a, b): + """Like peek_test, but the opposite sense.""" + if self.subset(a, b): return '0' + return self.not_in_test(self.peek_call(a), a, b) + + def calculate(self): + """The main loop to compute the epsilon, first, follow sets. + The loop continues until the sets converge. This works because + each set can only get larger, so when they stop getting larger, + we're done.""" + # First we determine whether a rule accepts epsilon (the empty sequence) + while 1: + for r in self.goals: + self.rules[r].setup(self) + if self.change_count == 0: break + self.change_count = 0 + + # Now we compute the first/follow sets + while 1: + for r in self.goals: + self.rules[r].update(self) + if self.change_count == 0: break + self.change_count = 0 + + def dump_information(self): + """Display the grammar in somewhat human-readable form.""" + self.calculate() + for r in self.goals: + print ' _____' + '_'*len(r) + print ('___/Rule '+r+'\\' + '_'*80)[:79] + queue = [self.rules[r]] + while queue: + top = queue[0] + del queue[0] + + print 'Rule', repr(top), 'of class', top.__class__.__name__ + top.first.sort() + top.follow.sort() + eps = [] + if top.accepts_epsilon: eps = ['(null)'] + print ' FIRST:', ', '.join(top.first+eps) + print ' FOLLOW:', ', '.join(top.follow) + for x in top.get_children(): queue.append(x) + + def repr_ignore(self): + out="{" + for t,s in self.ignore.iteritems(): + if s is None: s=repr(s) + out += "%s:%s," % (repr(t),s) + out += "}" + return out + + def generate_output(self): + self.calculate() + self.write(self.preparser) + self.write("# Begin -- grammar generated by Yapps\n") + self.write("import sys, re\n") + self.write("from yapps import runtime\n") + self.write("\n") + self.write("class ", self.name, "Scanner(runtime.Scanner):\n") + self.write(" patterns = [\n") + for p in self.terminals: + self.write(" (%s, re.compile(%s)),\n" % ( + repr(p), repr(self.tokens[p]))) + self.write(" ]\n") + self.write(" def __init__(self, str,*args,**kw):\n") + self.write(" runtime.Scanner.__init__(self,None,%s,str,*args,**kw)\n" % + self.repr_ignore()) + self.write("\n") + + self.write("class ", self.name, "(runtime.Parser):\n") + self.write(INDENT, "Context = runtime.Context\n") + for r in self.goals: + self.write(INDENT, "def ", r, "(self") + if self.params[r]: self.write(", ", self.params[r]) + self.write(", _parent=None):\n") + self.write(INDENT+INDENT, "_context = self.Context(_parent, self._scanner, %s, [%s])\n" % + (repr(r), self.params.get(r, ''))) + self.rules[r].output(self, 
INDENT+INDENT) + self.write("\n") + + self.write("\n") + self.write("def parse(rule, text):\n") + self.write(" P = ", self.name, "(", self.name, "Scanner(text))\n") + self.write(" return runtime.wrap_error_reporter(P, rule)\n") + self.write("\n") + if self.postparser is not None: + self.write("# End -- grammar generated by Yapps\n") + self.write(self.postparser) + else: + self.write("if __name__ == '__main__':\n") + self.write(INDENT, "from sys import argv, stdin\n") + self.write(INDENT, "if len(argv) >= 2:\n") + self.write(INDENT*2, "if len(argv) >= 3:\n") + self.write(INDENT*3, "f = open(argv[2],'r')\n") + self.write(INDENT*2, "else:\n") + self.write(INDENT*3, "f = stdin\n") + self.write(INDENT*2, "print parse(argv[1], f.read())\n") + self.write(INDENT, "else: print >>sys.stderr, 'Args: <rule> [<filename>]'\n") + self.write("# End -- grammar generated by Yapps\n") + +###################################################################### +class Node: + """This is the base class for all components of a grammar.""" + def __init__(self, rule): + self.rule = rule # name of the rule containing this node + self.first = [] + self.follow = [] + self.accepts_epsilon = 0 + + def setup(self, gen): + # Setup will change accepts_epsilon, + # sometimes from 0 to 1 but never 1 to 0. + # It will take a finite number of steps to set things up + pass + + def used(self, vars): + "Return two lists: one of vars used, and the other of vars assigned" + return vars, [] + + def get_children(self): + "Return a list of sub-nodes" + return [] + + def __repr__(self): + return str(self) + + def update(self, gen): + if self.accepts_epsilon: + gen.add_to(self.first, self.follow) + + def output(self, gen, indent): + "Write out code to _gen_ with _indent_:string indentation" + gen.write(indent, "assert 0 # Invalid parser node\n") + +class Terminal(Node): + """This class stores terminal nodes, which are tokens.""" + def __init__(self, rule, token): + Node.__init__(self, rule) + self.token = token + self.accepts_epsilon = 0 + + def __str__(self): + return self.token + + def update(self, gen): + Node.update(self, gen) + if self.first != [self.token]: + self.first = [self.token] + gen.changed() + + def output(self, gen, indent): + gen.write(indent) + if re.match('[a-zA-Z_][a-zA-Z_0-9]*$', self.token): + gen.write(self.token, " = ") + gen.write("self._scan(%s, context=_context)\n" % repr(self.token)) + +class Eval(Node): + """This class stores evaluation nodes, from {{ ... 
}} clauses.""" + def __init__(self, rule, expr): + Node.__init__(self, rule) + self.expr = expr + + def setup(self, gen): + Node.setup(self, gen) + if not self.accepts_epsilon: + self.accepts_epsilon = 1 + gen.changed() + + def __str__(self): + return '{{ %s }}' % self.expr.strip() + + def output(self, gen, indent): + gen.write(indent, self.expr.strip(), '\n') + +class NonTerminal(Node): + """This class stores nonterminal nodes, which are rules with arguments.""" + def __init__(self, rule, name, args): + Node.__init__(self, rule) + self.name = name + self.args = args + + def setup(self, gen): + Node.setup(self, gen) + try: + self.target = gen.rules[self.name] + if self.accepts_epsilon != self.target.accepts_epsilon: + self.accepts_epsilon = self.target.accepts_epsilon + gen.changed() + except KeyError: # Oops, it's nonexistent + print >>sys.stderr, 'Error: no rule <%s>' % self.name + self.target = self + + def __str__(self): + return '%s' % self.name + + def update(self, gen): + Node.update(self, gen) + gen.equate(self.first, self.target.first) + gen.equate(self.follow, self.target.follow) + + def output(self, gen, indent): + gen.write(indent) + gen.write(self.name, " = ") + args = self.args + if args: args += ', ' + args += '_context' + gen.write("self.", self.name, "(", args, ")\n") + +class Sequence(Node): + """This class stores a sequence of nodes (A B C ...)""" + def __init__(self, rule, *children): + Node.__init__(self, rule) + self.children = children + + def setup(self, gen): + Node.setup(self, gen) + for c in self.children: c.setup(gen) + + if not self.accepts_epsilon: + # If it's not already accepting epsilon, it might now do so. + for c in self.children: + # any non-epsilon means all is non-epsilon + if not c.accepts_epsilon: break + else: + self.accepts_epsilon = 1 + gen.changed() + + def get_children(self): + return self.children + + def __str__(self): + return '( %s )' % ' '.join(map(str, self.children)) + + def update(self, gen): + Node.update(self, gen) + for g in self.children: + g.update(gen) + + empty = 1 + for g_i in range(len(self.children)): + g = self.children[g_i] + + if empty: gen.add_to(self.first, g.first) + if not g.accepts_epsilon: empty = 0 + + if g_i == len(self.children)-1: + next = self.follow + else: + next = self.children[1+g_i].first + gen.add_to(g.follow, next) + + if self.children: + gen.add_to(self.follow, self.children[-1].follow) + + def output(self, gen, indent): + if self.children: + for c in self.children: + c.output(gen, indent) + else: + # Placeholder for empty sequences, just in case + gen.write(indent, 'pass\n') + +class Choice(Node): + """This class stores a choice between nodes (A | B | C | ...)""" + def __init__(self, rule, *children): + Node.__init__(self, rule) + self.children = children + + def setup(self, gen): + Node.setup(self, gen) + for c in self.children: c.setup(gen) + + if not self.accepts_epsilon: + for c in self.children: + if c.accepts_epsilon: + self.accepts_epsilon = 1 + gen.changed() + + def get_children(self): + return self.children + + def __str__(self): + return '( %s )' % ' | '.join(map(str, self.children)) + + def update(self, gen): + Node.update(self, gen) + for g in self.children: + g.update(gen) + + for g in self.children: + gen.add_to(self.first, g.first) + gen.add_to(self.follow, g.follow) + for g in self.children: + gen.add_to(g.follow, self.follow) + if self.accepts_epsilon: + gen.add_to(self.first, self.follow) + + def output(self, gen, indent): + test = "if" + gen.write(indent, "_token = ", 
gen.peek_call(self.first), "\n")
+        tokens_seen = []
+        tokens_unseen = self.first[:]
+        if gen.has_option('context-insensitive-scanner'):
+            # Context-insensitive scanners can return ANY token,
+            # not only the ones in first.
+            tokens_unseen = gen.non_ignored_tokens()
+        for c in self.children:
+            testset = c.first[:]
+            removed = []
+            # Iterate over a copy: removing from the list being
+            # iterated over would silently skip elements.
+            for x in testset[:]:
+                if x in tokens_seen:
+                    testset.remove(x)
+                    removed.append(x)
+                if x in tokens_unseen: tokens_unseen.remove(x)
+            tokens_seen = tokens_seen + testset
+            if removed:
+                if not testset:
+                    print >>sys.stderr, 'Error in rule', self.rule+':'
+                else:
+                    print >>sys.stderr, 'Warning in rule', self.rule+':'
+                print >>sys.stderr, ' *', self
+                print >>sys.stderr, ' * These tokens could be matched by more than one clause:'
+                print >>sys.stderr, ' *', ' '.join(removed)
+
+            if testset:
+                if not tokens_unseen: # context-sensitive scanners only!
+                    if test == 'if':
+                        # If it's the first AND last test, we can
+                        # simply emit the code without an if/else.
+                        c.output(gen, indent)
+                    else:
+                        gen.write(indent, "else:")
+                        t = gen.in_test('', [], testset)
+                        if len(t) < 70-len(indent):
+                            gen.write(' #', t)
+                        gen.write("\n")
+                        c.output(gen, indent+INDENT)
+                else:
+                    gen.write(indent, test, " ",
+                              gen.in_test('_token', tokens_unseen, testset),
+                              ":\n")
+                    c.output(gen, indent+INDENT)
+                    test = "elif"
+
+        if tokens_unseen:
+            gen.write(indent, "else:\n")
+            gen.write(indent, INDENT, "raise runtime.SyntaxError(_token[0], ")
+            gen.write("'Could not match ", self.rule, "')\n")
+
+class Wrapper(Node):
+    """This is a base class for nodes that modify a single child."""
+    def __init__(self, rule, child):
+        Node.__init__(self, rule)
+        self.child = child
+
+    def setup(self, gen):
+        Node.setup(self, gen)
+        self.child.setup(gen)
+
+    def get_children(self):
+        return [self.child]
+
+    def update(self, gen):
+        Node.update(self, gen)
+        self.child.update(gen)
+        gen.add_to(self.first, self.child.first)
+        gen.equate(self.follow, self.child.follow)
+
+class Option(Wrapper):
+    """This class represents an optional clause of the form [A]"""
+    def setup(self, gen):
+        Wrapper.setup(self, gen)
+        if not self.accepts_epsilon:
+            self.accepts_epsilon = 1
+            gen.changed()
+
+    def __str__(self):
+        return '[ %s ]' % str(self.child)
+
+    def output(self, gen, indent):
+        if self.child.accepts_epsilon:
+            print >>sys.stderr, 'Warning in rule', self.rule+': contents may be empty.'
+        gen.write(indent, "if %s:\n" %
+                  gen.peek_test(self.first, self.child.first))
+        self.child.output(gen, indent+INDENT)
+
+        if gen.has_option('context-insensitive-scanner'):
+            gen.write(indent, "if %s:\n" %
+                      gen.not_peek_test(gen.non_ignored_tokens(), self.follow))
+            gen.write(indent+INDENT, "raise runtime.SyntaxError(pos=self._scanner.get_pos(), context=_context, msg='Need one of ' + ', '.join(%s))\n" %
+                      repr(self.first))
+
+
+class Plus(Wrapper):
+    """This class represents a 1-or-more repetition clause of the form A+"""
+    def setup(self, gen):
+        Wrapper.setup(self, gen)
+        if self.accepts_epsilon != self.child.accepts_epsilon:
+            self.accepts_epsilon = self.child.accepts_epsilon
+            gen.changed()
+
+    def __str__(self):
+        return '%s+' % str(self.child)
+
+    def update(self, gen):
+        Wrapper.update(self, gen)
+        gen.add_to(self.child.follow, self.child.first)
+
+    def output(self, gen, indent):
+        if self.child.accepts_epsilon:
+            print >>sys.stderr, 'Warning in rule', self.rule+':'
+            print >>sys.stderr, ' * The repeated pattern could be empty. The resulting parser may not work properly.'
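+        # A+ is emitted as a do-while loop: the child runs once, then
+        # repeats until the lookahead can no longer start another child.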
+ gen.write(indent, "while 1:\n") + self.child.output(gen, indent+INDENT) + union = self.first[:] + gen.add_to(union, self.follow) + gen.write(indent+INDENT, "if %s: break\n" % + gen.not_peek_test(union, self.child.first)) + + if gen.has_option('context-insensitive-scanner'): + gen.write(indent, "if %s:\n" % + gen.not_peek_test(gen.non_ignored_tokens(), self.follow)) + gen.write(indent+INDENT, "raise runtime.SyntaxError(pos=self._scanner.get_pos(), context=_context, msg='Need one of ' + ', '.join(%s))\n" % + repr(self.first)) + + +class Star(Wrapper): + """This class represents a 0-or-more repetition clause of the form A*""" + def setup(self, gen): + Wrapper.setup(self, gen) + if not self.accepts_epsilon: + self.accepts_epsilon = 1 + gen.changed() + + def __str__(self): + return '%s*' % str(self.child) + + def update(self, gen): + Wrapper.update(self, gen) + gen.add_to(self.child.follow, self.child.first) + + def output(self, gen, indent): + if self.child.accepts_epsilon: + print >>sys.stderr, 'Warning in rule', self.rule+':' + print >>sys.stderr, ' * The repeated pattern could be empty. The resulting parser probably will not work properly.' + gen.write(indent, "while %s:\n" % + gen.peek_test(self.follow, self.child.first)) + self.child.output(gen, indent+INDENT) + + # TODO: need to generate tests like this in lots of rules + if gen.has_option('context-insensitive-scanner'): + gen.write(indent, "if %s:\n" % + gen.not_peek_test(gen.non_ignored_tokens(), self.follow)) + gen.write(indent+INDENT, "raise runtime.SyntaxError(pos=self._scanner.get_pos(), context=_context, msg='Need one of ' + ', '.join(%s))\n" % + repr(self.first)) + diff --git a/yapps/runtime.py b/yapps/runtime.py new file mode 100644 index 0000000..5d9d1d6 --- /dev/null +++ b/yapps/runtime.py @@ -0,0 +1,442 @@ +# Yapps 2 Runtime, part of Yapps 2 - yet another python parser system +# Copyright 1999-2003 by Amit J. Patel <amitp@cs.stanford.edu> +# Enhancements copyright 2003-2004 by Matthias Urlichs <smurf@debian.org> +# +# This version of the Yapps 2 Runtime can be distributed under the +# terms of the MIT open source license, either found in the LICENSE file +# included with the Yapps distribution +# <http://theory.stanford.edu/~amitp/yapps/> or at +# <http://www.opensource.org/licenses/mit-license.php> +# + +"""Run time libraries needed to run parsers generated by Yapps. + +This module defines parse-time exception classes, a scanner class, a +base class for parsers produced by Yapps, and a context class that +keeps track of the parse stack. + +""" + +import sys, re + +MIN_WINDOW=4096 +# File lookup window + +class SyntaxError(Exception): + """When we run into an unexpected token, this is the exception to use""" + def __init__(self, pos=None, msg="Bad Token", context=None): + Exception.__init__(self) + self.pos = pos + self.msg = msg + self.context = context + + def __str__(self): + if not self.pos: return 'SyntaxError' + else: return 'SyntaxError@%s(%s)' % (repr(self.pos), self.msg) + +class NoMoreTokens(Exception): + """Another exception object, for when we run out of tokens""" + pass + +class Token(object): + """Yapps token. + + This is a container for a scanned token. 
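+
+    Attributes are 'type' (the terminal name), 'value' (the matched
+    text) and 'pos' (an optional (file, line, column) tuple).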
+ """ + + def __init__(self, type,value, pos=None): + """Initialize a token.""" + self.type = type + self.value = value + self.pos = pos + + def __repr__(self): + output = '<%s: %s' % (self.type, repr(self.value)) + if self.pos: + output += " @ " + if self.pos[0]: + output += "%s:" % self.pos[0] + if self.pos[1]: + output += "%d" % self.pos[1] + if self.pos[2] is not None: + output += ".%d" % self.pos[2] + output += ">" + return output + +in_name=0 +class Scanner(object): + """Yapps scanner. + + The Yapps scanner can work in context sensitive or context + insensitive modes. The token(i) method is used to retrieve the + i-th token. It takes a restrict set that limits the set of tokens + it is allowed to return. In context sensitive mode, this restrict + set guides the scanner. In context insensitive mode, there is no + restriction (the set is always the full set of tokens). + + """ + + def __init__(self, patterns, ignore, input="", + file=None,filename=None,stacked=False): + """Initialize the scanner. + + Parameters: + patterns : [(terminal, uncompiled regex), ...] or None + ignore : {terminal:None, ...} + input : string + + If patterns is None, we assume that the subclass has + defined self.patterns : [(terminal, compiled regex), ...]. + Note that the patterns parameter expects uncompiled regexes, + whereas the self.patterns field expects compiled regexes. + + The 'ignore' value is either None or a callable, which is called + with the scanner and the to-be-ignored match object; this can + be used for include file or comment handling. + """ + + if not filename: + global in_name + filename="<f.%d>" % in_name + in_name += 1 + + self.input = input + self.ignore = ignore + self.file = file + self.filename = filename + self.pos = 0 + self.del_pos = 0 # skipped + self.line = 1 + self.del_line = 0 # skipped + self.col = 0 + self.tokens = [] + self.stack = None + self.stacked = stacked + + self.last_read_token = None + self.last_token = None + self.last_types = None + + if patterns is not None: + # Compile the regex strings into regex objects + self.patterns = [] + for terminal, regex in patterns: + self.patterns.append( (terminal, re.compile(regex)) ) + + def stack_input(self, input="", file=None, filename=None): + """Temporarily parse from a second file.""" + + # Already reading from somewhere else: Go on top of that, please. 
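+        # Includes may nest: delegate to the innermost stacked scanner
+        # rather than replacing it, so positions stay per-file.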
+ if self.stack: + # autogenerate a recursion-level-identifying filename + if not filename: + filename = 1 + else: + try: + filename += 1 + except TypeError: + pass + # now pass off to the include file + self.stack.stack_input(input,file,filename) + else: + + try: + filename += 0 + except TypeError: + pass + else: + filename = "<str_%d>" % filename + +# self.stack = object.__new__(self.__class__) +# Scanner.__init__(self.stack,self.patterns,self.ignore,input,file,filename, stacked=True) + + # Note that the pattern+ignore are added by the generated + # scanner code + self.stack = self.__class__(input,file,filename, stacked=True) + + def get_pos(self): + """Return a file/line/char tuple.""" + if self.stack: return self.stack.get_pos() + + return (self.filename, self.line+self.del_line, self.col) + +# def __repr__(self): +# """Print the last few tokens that have been scanned in""" +# output = '' +# for t in self.tokens: +# output += '%s\n' % (repr(t),) +# return output + + def print_line_with_pointer(self, pos, length=0, out=sys.stderr): + """Print the line of 'text' that includes position 'p', + along with a second line with a single caret (^) at position p""" + + file,line,p = pos + if file != self.filename: + if self.stack: return self.stack.print_line_with_pointer(pos,length=length,out=out) + print >>out, "(%s: not in input buffer)" % file + return + + text = self.input + p += length-1 # starts at pos 1 + + origline=line + line -= self.del_line + spos=0 + if line > 0: + while 1: + line = line - 1 + try: + cr = text.index("\n",spos) + except ValueError: + if line: + text = "" + break + if line == 0: + text = text[spos:cr] + break + spos = cr+1 + else: + print >>out, "(%s:%d not in input buffer)" % (file,origline) + return + + # Now try printing part of the line + text = text[max(p-80, 0):p+80] + p = p - max(p-80, 0) + + # Strip to the left + i = text[:p].rfind('\n') + j = text[:p].rfind('\r') + if i < 0 or (0 <= j < i): i = j + if 0 <= i < p: + p = p - i - 1 + text = text[i+1:] + + # Strip to the right + i = text.find('\n', p) + j = text.find('\r', p) + if i < 0 or (0 <= j < i): i = j + if i >= 0: + text = text[:i] + + # Now shorten the text + while len(text) > 70 and p > 60: + # Cut off 10 chars + text = "..." + text[10:] + p = p - 7 + + # Now print the string, along with an indicator + print >>out, '> ',text + print >>out, '> ',' '*p + '^' + + def grab_input(self): + """Get more input if possible.""" + if not self.file: return + if len(self.input) - self.pos >= MIN_WINDOW: return + + data = self.file.read(MIN_WINDOW) + if data is None or data == "": + self.file = None + + # Drop bytes from the start, if necessary. 
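+        # (del_pos/del_line record how much has been discarded, so
+        # get_pos() can still report absolute positions.)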
+ if self.pos > 2*MIN_WINDOW: + self.del_pos += MIN_WINDOW + self.del_line += self.input[:MIN_WINDOW].count("\n") + self.pos -= MIN_WINDOW + self.input = self.input[MIN_WINDOW:] + data + else: + self.input = self.input + data + + def getchar(self): + """Return the next character.""" + self.grab_input() + + c = self.input[self.pos] + self.pos += 1 + return c + + def token(self, restrict, context=None): + """Scan for another token.""" + + while 1: + if self.stack: + try: + return self.stack.token(restrict, context) + except StopIteration: + self.stack = None + + # Keep looking for a token, ignoring any in self.ignore + self.grab_input() + + # special handling for end-of-file + if self.stacked and self.pos==len(self.input): + raise StopIteration + + # Search the patterns for the longest match, with earlier + # tokens in the list having preference + best_match = -1 + best_pat = '(error)' + best_m = None + for p, regexp in self.patterns: + # First check to see if we're ignoring this token + if restrict and p not in restrict and p not in self.ignore: + continue + m = regexp.match(self.input, self.pos) + if m and m.end()-m.start() > best_match: + # We got a match that's better than the previous one + best_pat = p + best_match = m.end()-m.start() + best_m = m + + # If we didn't find anything, raise an error + if best_pat == '(error)' and best_match < 0: + msg = 'Bad Token' + if restrict: + msg = 'Trying to find one of '+', '.join(restrict) + raise SyntaxError(self.get_pos(), msg, context=context) + + ignore = best_pat in self.ignore + value = self.input[self.pos:self.pos+best_match] + if not ignore: + tok=Token(type=best_pat, value=value, pos=self.get_pos()) + + self.pos += best_match + + npos = value.rfind("\n") + if npos > -1: + self.col = best_match-npos + self.line += value.count("\n") + else: + self.col += best_match + + # If we found something that isn't to be ignored, return it + if not ignore: + if len(self.tokens) >= 10: + del self.tokens[0] + self.tokens.append(tok) + self.last_read_token = tok + # print repr(tok) + return tok + else: + ignore = self.ignore[best_pat] + if ignore: + ignore(self, best_m) + + def peek(self, *types, **kw): + """Returns the token type for lookahead; if there are any args + then the list of args is the set of token types to allow""" + context = kw.get("context",None) + if self.last_token is None: + self.last_types = types + self.last_token = self.token(types,context) + elif self.last_types: + for t in types: + if t not in self.last_types: + raise NotImplementedError("Unimplemented: restriction set changed") + return self.last_token.type + + def scan(self, type, **kw): + """Returns the matched text, and moves to the next token""" + context = kw.get("context",None) + + if self.last_token is None: + tok = self.token([type],context) + else: + if self.last_types and type not in self.last_types: + raise NotImplementedError("Unimplemented: restriction set changed") + + tok = self.last_token + self.last_token = None + if tok.type != type: + if not self.last_types: self.last_types=[] + raise SyntaxError(tok.pos, 'Trying to find '+type+': '+ ', '.join(self.last_types)+", got "+tok.type, context=context) + return tok.value + +class Parser(object): + """Base class for Yapps-generated parsers. 
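+
+    Generated parsers subclass this, adding one method per grammar
+    rule; the helpers below simply delegate to the scanner.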
+ + """ + + def __init__(self, scanner): + self._scanner = scanner + + def _stack(self, input="",file=None,filename=None): + """Temporarily read from someplace else""" + self._scanner.stack_input(input,file,filename) + self._tok = None + + def _peek(self, *types, **kw): + """Returns the token type for lookahead; if there are any args + then the list of args is the set of token types to allow""" + return self._scanner.peek(*types, **kw) + + def _scan(self, type, **kw): + """Returns the matched text, and moves to the next token""" + return self._scanner.scan(type, **kw) + +class Context(object): + """Class to represent the parser's call stack. + + Every rule creates a Context that links to its parent rule. The + contexts can be used for debugging. + + """ + + def __init__(self, parent, scanner, rule, args=()): + """Create a new context. + + Args: + parent: Context object or None + scanner: Scanner object + rule: string (name of the rule) + args: tuple listing parameters to the rule + + """ + self.parent = parent + self.scanner = scanner + self.rule = rule + self.args = args + while scanner.stack: scanner = scanner.stack + self.token = scanner.last_read_token + + def __str__(self): + output = '' + if self.parent: output = str(self.parent) + ' > ' + output += self.rule + return output + +def print_error(err, scanner, max_ctx=None): + """Print error messages, the parser stack, and the input text -- for human-readable error messages.""" + # NOTE: this function assumes 80 columns :-( + # Figure out the line number + pos = err.pos + if not pos: + pos = scanner.get_pos() + + file_name, line_number, column_number = pos + print >>sys.stderr, '%s:%d:%d: %s' % (file_name, line_number, column_number, err.msg) + + scanner.print_line_with_pointer(pos) + + context = err.context + token = None + while context: + print >>sys.stderr, 'while parsing %s%s:' % (context.rule, tuple(context.args)) + if context.token: + token = context.token + if token: + scanner.print_line_with_pointer(token.pos, length=len(token.value)) + context = context.parent + if max_ctx: + max_ctx = max_ctx-1 + if not max_ctx: + break + +def wrap_error_reporter(parser, rule, *args,**kw): + try: + return getattr(parser, rule)(*args,**kw) + except SyntaxError, e: + print_error(e, parser._scanner) + except NoMoreTokens: + print >>sys.stderr, 'Could not complete parsing; stopped around here:' + print >>sys.stderr, parser._scanner diff --git a/yapps2.py b/yapps2.py new file mode 100755 index 0000000..d6fd101 --- /dev/null +++ b/yapps2.py @@ -0,0 +1,113 @@ +#!/usr/bin/python + +# +# Yapps 2 - yet another python parser system +# Copyright 1999-2003 by Amit J. 
Patel <amitp@cs.stanford.edu>
+#
+# This version of Yapps 2 can be distributed under the
+# terms of the MIT open source license, either found in the LICENSE file
+# included with the Yapps distribution
+# <http://theory.stanford.edu/~amitp/yapps/> or at
+# <http://www.opensource.org/licenses/mit-license.php>
+#
+
+import sys, re
+
+from yapps import runtime, parsetree
+
+def generate(inputfilename, outputfilename='', dump=0, **flags):
+    """Generate a parser from the grammar in 'inputfilename' (X.g),
+    writing it to 'outputfilename' (defaulting to X.py).
+
+    Note: this relies on the module-level names 'grammar' and
+    'yapps_options', which are bound in the __main__ block below.
+    """
+
+    if not outputfilename:
+        if inputfilename.endswith('.g'):
+            outputfilename = inputfilename[:-2] + '.py'
+        else:
+            raise Exception('Must specify output filename if input filename is not *.g')
+
+    DIVIDER = '\n%%\n' # This pattern separates the pre/post parsers
+    preparser, postparser = None, None # Code before and after the parser desc
+
+    # Read the entire file
+    s = open(inputfilename,'r').read()
+
+    # See if there's a separation between the pre-parser and parser
+    f = s.find(DIVIDER)
+    if f >= 0: preparser, s = s[:f]+'\n\n', s[f+len(DIVIDER):]
+
+    # See if there's a separation between the parser and post-parser
+    f = s.find(DIVIDER)
+    if f >= 0: s, postparser = s[:f], '\n\n'+s[f+len(DIVIDER):]
+
+    # Create the parser and scanner and parse the text
+    scanner = grammar.ParserDescriptionScanner(s, filename=inputfilename)
+    if preparser: scanner.del_line += preparser.count('\n')
+
+    parser = grammar.ParserDescription(scanner)
+    t = runtime.wrap_error_reporter(parser, 'Parser')
+    if t is None: return 1 # Failure
+    if preparser is not None: t.preparser = preparser
+    if postparser is not None: t.postparser = postparser
+
+    # Check the options
+    for f in t.options.keys():
+        for opt,_,_ in yapps_options:
+            if f == opt: break
+        else:
+            print >>sys.stderr, 'Warning: unrecognized option', f
+    # Add command line options to the set
+    for f in flags.keys(): t.options[f] = flags[f]
+
+    # Generate the output
+    if dump:
+        t.dump_information()
+    else:
+        t.output = open(outputfilename, 'w')
+        t.generate_output()
+    return 0
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod(sys.modules['__main__'])
+    doctest.testmod(parsetree)
+
+    # Someday I will use optparse, but Python 2.3 is too new at the moment.
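+    # Each entry: (name accepted by the -f flag, option key stored on
+    # the grammar's options, help text for the usage message).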
+ yapps_options = [ + ('context-insensitive-scanner', + 'context-insensitive-scanner', + 'Scan all tokens (see docs)'), + ] + + import getopt + optlist, args = getopt.getopt(sys.argv[1:], 'f:', ['help', 'dump', 'use-devel-grammar']) + if not args or len(args) > 2: + print >>sys.stderr, 'Usage:' + print >>sys.stderr, ' python', sys.argv[0], '[flags] input.g [output.py]' + print >>sys.stderr, 'Flags:' + print >>sys.stderr, (' --dump' + ' '*40)[:35] + 'Dump out grammar information' + print >>sys.stderr, (' --use-devel-grammar' + ' '*40)[:35] + 'Use the devel grammar parser from yapps_grammar.py instead of the stable grammar from grammar.py' + for flag, _, doc in yapps_options: + print >>sys.stderr, (' -f' + flag + ' '*40)[:35] + doc + else: + # Read in the options and create a list of flags + flags = {} + use_devel_grammar = 0 + for opt in optlist: + for flag, name, _ in yapps_options: + if opt == ('-f', flag): + flags[name] = 1 + break + else: + if opt == ('--dump', ''): + flags['dump'] = 1 + elif opt == ('--use-devel-grammar', ''): + use_devel_grammar = 1 + else: + print >>sys.stderr, 'Warning: unrecognized option', opt[0], opt[1] + + if use_devel_grammar: + import yapps_grammar as grammar + else: + from yapps import grammar + + sys.exit(generate(*tuple(args), **flags)) diff --git a/yapps_grammar.g b/yapps_grammar.g new file mode 100644 index 0000000..cd21a9e --- /dev/null +++ b/yapps_grammar.g @@ -0,0 +1,121 @@ +# grammar.py, part of Yapps 2 - yet another python parser system +# Copyright 1999-2003 by Amit J. Patel <amitp@cs.stanford.edu> +# Enhancements copyright 2003-2004 by Matthias Urlichs <smurf@debian.org> +# +# This version of the Yapps 2 grammar can be distributed under the +# terms of the MIT open source license, either found in the LICENSE +# file included with the Yapps distribution +# <http://theory.stanford.edu/~amitp/yapps/> or at +# <http://www.opensource.org/licenses/mit-license.php> +# + +"""Parser for Yapps grammars. + +This file defines the grammar of Yapps grammars. Naturally, it is +implemented in Yapps. The grammar.py module needed by Yapps is built +by running Yapps on yapps_grammar.g. (Holy circularity, Batman!) 
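+The %% dividers below separate this Python prologue from the grammar
+proper, as split by generate() in yapps2.py.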
+
+"""
+
+import sys, re
+from yapps import parsetree
+
+######################################################################
+def cleanup_choice(rule, lst):
+    if len(lst) == 0: return parsetree.Sequence(rule, [])
+    if len(lst) == 1: return lst[0]
+    return parsetree.Choice(rule, *tuple(lst))
+
+def cleanup_sequence(rule, lst):
+    if len(lst) == 1: return lst[0]
+    return parsetree.Sequence(rule, *tuple(lst))
+
+def resolve_name(rule, tokens, id, args):
+    if id in [x[0] for x in tokens]:
+        # It's a token
+        if args:
+            print >>sys.stderr, 'Warning: ignoring parameters on TOKEN %s<<%s>>' % (id, args)
+        return parsetree.Terminal(rule, id)
+    else:
+        # It's a name, so assume it's a nonterminal
+        return parsetree.NonTerminal(rule, id, args)
+
+%%
+parser ParserDescription:
+
+    ignore: "[ \t\r\n]+"
+    ignore: "#.*?\r?\n"
+    token EOF: "$"
+    token ATTR: "<<.+?>>"
+    token STMT: "{{.+?}}"
+    token ID: '[a-zA-Z_][a-zA-Z_0-9]*'
+    token STR: '[rR]?\'([^\\n\'\\\\]|\\\\.)*\'|[rR]?"([^\\n"\\\\]|\\\\.)*"'
+    token LP: '\\('
+    token RP: '\\)'
+    token LB: '\\['
+    token RB: '\\]'
+    token OR: '[|]'
+    token STAR: '[*]'
+    token PLUS: '[+]'
+    token QUEST: '[?]'
+    token COLON: ':'
+
+    rule Parser: "parser" ID ":"
+                     Options
+                     Tokens
+                     Rules<<Tokens>>
+                 EOF
+                 {{ return parsetree.Generator(ID,Options,Tokens,Rules) }}
+
+    rule Options: {{ opt = {} }}
+                  ( "option" ":" Str {{ opt[Str] = 1 }} )*
+                  {{ return opt }}
+
+    rule Tokens:  {{ tok = [] }}
+                  (
+                    "token" ID ":" Str {{ tok.append( (ID,Str) ) }}
+                  | "ignore"
+                       ":" Str {{ ign = ('#ignore',Str) }}
+                       ( STMT {{ ign = ign + (STMT[2:-2],) }} )?
+                       {{ tok.append( ign ) }}
+                  )*
+                  {{ return tok }}
+
+    rule Rules<<tokens>>:
+                  {{ rul = [] }}
+                  (
+                    "rule" ID OptParam ":" ClauseA<<ID, tokens>>
+                    {{ rul.append( (ID, OptParam, ClauseA) ) }}
+                  )*
+                  {{ return rul }}
+
+    rule ClauseA<<rule, tokens>>:
+                  ClauseB<<rule,tokens>>
+                  {{ v = [ClauseB] }}
+                  ( OR ClauseB<<rule,tokens>> {{ v.append(ClauseB) }} )*
+                  {{ return cleanup_choice(rule,v) }}
+
+    rule ClauseB<<rule,tokens>>:
+                  {{ v = [] }}
+                  ( ClauseC<<rule,tokens>> {{ v.append(ClauseC) }} )*
+                  {{ return cleanup_sequence(rule, v) }}
+
+    rule ClauseC<<rule,tokens>>:
+                  ClauseD<<rule,tokens>>
+                  ( PLUS {{ return parsetree.Plus(rule, ClauseD) }}
+                  | STAR {{ return parsetree.Star(rule, ClauseD) }}
+                  | QUEST {{ return parsetree.Option(rule, ClauseD) }}
+                  | {{ return ClauseD }} )
+
+    rule ClauseD<<rule,tokens>>:
+                  STR {{ t = (STR, eval(STR,{},{})) }}
+                      {{ if t not in tokens: tokens.insert( 0, t ) }}
+                      {{ return parsetree.Terminal(rule, STR) }}
+                  | ID OptParam {{ return resolve_name(rule,tokens, ID, OptParam) }}
+                  | LP ClauseA<<rule,tokens>> RP {{ return ClauseA }}
+                  | LB ClauseA<<rule,tokens>> RB {{ return parsetree.Option(rule, ClauseA) }}
+                  | STMT {{ return parsetree.Eval(rule, STMT[2:-2]) }}
+
+    rule OptParam: [ ATTR {{ return ATTR[2:-2] }} ] {{ return '' }}
+    rule Str: STR {{ return eval(STR,{},{}) }}
+%%
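A minimal usage sketch, assuming a grammar file calc.g whose start rule
is named goal (both names are hypothetical): yapps2.py compiles X.g to
X.py, and every generated module ends with the parse() wrapper emitted
by parsetree.py above, which reports errors via wrap_error_reporter.

    $ python yapps2.py calc.g        # writes calc.py next to the grammar

    import calc
    # Parse the given text, starting at the rule named 'goal';
    # returns the rule's value, or None after printing a syntax error.
    result = calc.parse('goal', '1 + 2')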