diff options
-rw-r--r-- | BUGS | 6 | ||||
-rw-r--r-- | Makefile | 5 | ||||
-rw-r--r-- | htmtojargon.awk | 5 | ||||
-rw-r--r-- | removeetags.pl | 4 | ||||
-rw-r--r-- | separateheadmeta.sed | 2 | ||||
-rw-r--r-- | xmlcleanup2.sed | 6 |
6 files changed, 12 insertions, 16 deletions
@@ -1,9 +1,3 @@ -There is a bug causing some words to not be included at all. Cowboy and chicken are examples. - -Word type (noun, adj, verb, etc.) markings after headword are included in the index, so searching for e.g. "A" or "n" is impossible as it returns all adjectives or nouns. - Many of the IPA and other non-ascii symbols aren't yet converted to unicode, and are just discarded. Many (but not all) of these symbols are documented in files in Help/ from the original CDs. Search results are not necessarily returned in order. This is probably unavoidable with the dict protocol. - -grep TODO symbols *sed *awk Makefile @@ -11,9 +11,10 @@ all: oed.jargon oed.dict.dz .htm.jargon: iconv -f ISO-8859-1 -t UTF-8 < "$<" \ + | sed 's/\r//g' \ | awk -f htmtojargon.awk \ - | sed -f separateheadmeta.sed \ | sed -f symbols.sed \ + | perl -p removeetags.pl \ | sed -f xmlcleanup.sed \ | sed -f xmlcleanup2.sed \ > "$@" @@ -28,7 +29,7 @@ oed.dict: oed.jargon cat $< \ | dictfmt -j --utf8 \ --columns 0 --headword-separator ',' \ - --index-data-separator ';' \ + --index-data-separator " " \ -u http://njw.me.uk/oed \ -s "Oxford English Dictionary, 2nd Edition" \ oed diff --git a/htmtojargon.awk b/htmtojargon.awk index 16c5356..323b7ee 100644 --- a/htmtojargon.awk +++ b/htmtojargon.awk @@ -1,5 +1,6 @@ #!/usr/bin/awk -f # dirty xml reading is more fun +# requires nawk / gawk for sub() BEGIN { FS = "</hg>" @@ -8,6 +9,10 @@ BEGIN { { printdefs($1, $2); } function printdefs(word, defs) { + # split the headwords and other information about the word, + # to be used in conjunction with dictfmt's --index-data-separator + sub("<i>", " ", word); + n = split(defs, array, "<def>"); for(i=0; i<=n; i++) { if(array[i] != "") { diff --git a/removeetags.pl b/removeetags.pl new file mode 100644 index 0000000..8de7875 --- /dev/null +++ b/removeetags.pl @@ -0,0 +1,4 @@ +# <e> tags seem to duplicate other preceding tags, so remove them +# +# sed doesn't support non-greedy matching, so we're using perl 
+s/<e>.*?<\/e>//g diff --git a/separateheadmeta.sed b/separateheadmeta.sed deleted file mode 100644 index 9dca6c6..0000000 --- a/separateheadmeta.sed +++ /dev/null @@ -1,2 +0,0 @@ -# this replaces the first <i> in a headword with "; " -/^:[^:]*:/ s/<i>/; / diff --git a/xmlcleanup2.sed b/xmlcleanup2.sed index 5352a92..56e66b0 100644 --- a/xmlcleanup2.sed +++ b/xmlcleanup2.sed @@ -1,12 +1,6 @@ #!/bin/sed -f # sed rules that must run after the main set -# <e> tags seem to duplicate <v> tags for some quotes -# NOTE this assumes e tags only come at end of lines -s/<e>.*$//g -# TODO: make this work instead of the above -#s/<e>.*?<\/e>//g - # any xml tags not processed can just go away s/<[^>]*>//g # any xml character entities not processed can just go away |