From 5d0c5974e2655a6f66153bffb67c9346c2c4a589 Mon Sep 17 00:00:00 2001 From: Nick White Date: Sun, 26 May 2013 21:52:41 +0100 Subject: Correct e tag removal, correct headword separation --- BUGS | 6 ------ Makefile | 5 +++-- htmtojargon.awk | 5 +++++ removeetags.pl | 4 ++++ separateheadmeta.sed | 2 -- xmlcleanup2.sed | 6 ------ 6 files changed, 12 insertions(+), 16 deletions(-) create mode 100644 removeetags.pl delete mode 100644 separateheadmeta.sed diff --git a/BUGS b/BUGS index 1426aab..7358117 100644 --- a/BUGS +++ b/BUGS @@ -1,9 +1,3 @@ -There is a bug causing some words to not be included at all. Cowboy and chicken are examples. - -Word type (noun, adj, verb, etc.) markings after headword are included in the index, so searching for e.g. "A" or "n" is impossible as it returns all adjectives or nouns. - Many of the IPA and other non-ascii symbols aren't yet converted to unicode, and are just discarded. Many (but not all) of these symbols are documented in files in Help/ from the original CDs. Search results are not necessarily returned in order. This is probably unavoidable with the dict protocol. 
- -grep TODO symbols *sed *awk Makefile diff --git a/Makefile b/Makefile index 10c4fb3..cfb4484 100644 --- a/Makefile +++ b/Makefile @@ -11,9 +11,10 @@ all: oed.jargon oed.dict.dz .htm.jargon: iconv -f ISO-8859-1 -t UTF-8 < "$<" \ + | sed 's/\r//g' \ | awk -f htmtojargon.awk \ - | sed -f separateheadmeta.sed \ | sed -f symbols.sed \ + | perl -p removeetags.pl \ | sed -f xmlcleanup.sed \ | sed -f xmlcleanup2.sed \ > "$@" @@ -28,7 +29,7 @@ oed.dict: oed.jargon cat $< \ | dictfmt -j --utf8 \ --columns 0 --headword-separator ',' \ - --index-data-separator ';' \ + --index-data-separator " " \ -u http://njw.me.uk/oed \ -s "Oxford English Dictionary, 2nd Edition" \ oed diff --git a/htmtojargon.awk b/htmtojargon.awk index 16c5356..323b7ee 100644 --- a/htmtojargon.awk +++ b/htmtojargon.awk @@ -1,5 +1,6 @@ #!/usr/bin/awk -f # dirty xml reading is more fun +# requires nawk / gawk for sub() BEGIN { FS = "" @@ -8,6 +9,10 @@ BEGIN { { printdefs($1, $2); } function printdefs(word, defs) { + # split the headwords and other information about the word, + # to be used in conjunction with dictfmt's --index-data-separator + sub("", " ", word); + n = split(defs, array, ""); for(i=0; i<=n; i++) { if(array[i] != "") { diff --git a/removeetags.pl b/removeetags.pl new file mode 100644 index 0000000..8de7875 --- /dev/null +++ b/removeetags.pl @@ -0,0 +1,4 @@ +# tags seem to duplicate other preceding tags, so remove them +# +# sed doesn't support non-greedy matching, so we're using perl +s/.*?<\/e>//g diff --git a/separateheadmeta.sed b/separateheadmeta.sed deleted file mode 100644 index 9dca6c6..0000000 --- a/separateheadmeta.sed +++ /dev/null @@ -1,2 +0,0 @@ -# this replaces the first in a headword with "; " -/^:[^:]*:/ s//; / diff --git a/xmlcleanup2.sed b/xmlcleanup2.sed index 5352a92..56e66b0 100644 --- a/xmlcleanup2.sed +++ b/xmlcleanup2.sed @@ -1,12 +1,6 @@ #!/bin/sed -f # sed rules that must run after the main set -# tags seem to duplicate tags for some quotes -# NOTE this assumes e 
tags only come at end of lines -s/.*$//g -# TODO: make this work instead of the above -#s/.*?<\/e>//g - # any xml tags not processed can just go away s/<[^>]*>//g # any xml character entities not processed can just go away -- cgit v1.2.3