diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0d4fed7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,110 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# vim swap +*.swp + +# macOS +.DS_Store diff --git a/code/__init__.pyc b/code/__init__.pyc deleted file mode 100644 index 49ae2ee..0000000 Binary files a/code/__init__.pyc and /dev/null differ diff --git a/code/buildtree.pyc b/code/buildtree.pyc deleted file mode 100644 index 8255e84..0000000 Binary files a/code/buildtree.pyc and /dev/null differ diff --git a/code/data.pyc b/code/data.pyc deleted file mode 100644 index 161dd54..0000000 Binary files a/code/data.pyc and /dev/null differ diff --git a/code/datastructure.pyc b/code/datastructure.pyc deleted file mode 100644 index 9489c68..0000000 Binary files a/code/datastructure.pyc and /dev/null differ diff --git a/code/docreader.pyc b/code/docreader.pyc deleted file mode 100644 index b73c791..0000000 Binary files a/code/docreader.pyc and /dev/null differ diff --git a/code/evalparser.pyc b/code/evalparser.pyc deleted file mode 100644 index cc48286..0000000 Binary files a/code/evalparser.pyc and /dev/null differ diff --git a/code/evaluation.pyc b/code/evaluation.pyc deleted file mode 100644 index 218ef54..0000000 Binary files a/code/evaluation.pyc and /dev/null differ diff --git a/code/featselection.pyc b/code/featselection.pyc deleted file mode 100644 index b7de999..0000000 Binary files a/code/featselection.pyc and /dev/null differ diff --git a/code/feature.pyc b/code/feature.pyc deleted file mode 100644 index fa4acf9..0000000 Binary files a/code/feature.pyc and /dev/null differ diff --git a/code/model.pyc b/code/model.pyc deleted file mode 100644 index d418332..0000000 Binary files a/code/model.pyc and /dev/null differ diff --git a/code/parser.pyc b/code/parser.pyc deleted file mode 100644 index 64c48d2..0000000 Binary files a/code/parser.pyc and /dev/null differ diff --git a/code/readdoc.pyc b/code/readdoc.pyc deleted file mode 100644 index 1f10c67..0000000 Binary files a/code/readdoc.pyc and /dev/null differ diff --git a/code/tree.pyc b/code/tree.pyc deleted file mode 100644 index 96c8b2c..0000000 Binary files a/code/tree.pyc and /dev/null differ diff --git a/code/util.pyc b/code/util.pyc deleted file mode 100644 index b5732db..0000000 Binary files a/code/util.pyc and /dev/null differ diff --git a/corenlp.sh b/corenlp.sh index cfc3c5d..13f566d 100755 --- a/corenlp.sh +++ b/corenlp.sh @@ -2,19 +2,17 @@ # # Runs Stanford CoreNLP. # Simple uses for xml and plain text output to files are: -# ./corenlp.sh -file filename -# ./corenlp.sh -file filename -outputFormat text +# ./corenlp.sh 8g /path/to/target_dir -scriptdir=`dirname $0` +scriptdir="stanford-corenlp" # echo java -mx3g -cp \"$scriptdir/*\" edu.stanford.nlp.pipeline.StanfordCoreNLP $* # $1 - path -PATH=$1 +JAVA_XMX=$1 +PATH=$2 for FNAME in $PATH/* do - /usr/bin/java -mx2g -cp "$scriptdir/*" edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,ner,parse -ssplit.eolonly -tokenize.whitespace true -file $FNAME - # /usr/bin/java -mx2g -cp "$scriptdir/*" edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,ner,parse -file $FNAME - /bin/mv $(/usr/bin/basename $FNAME.xml) $PATH/ + /usr/bin/java -Xmx$JAVA_XMX -cp "$scriptdir/*" edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,ner,parse -file $FNAME -outputFormat xml -outputDirectory $PATH done diff --git a/discoseg/__init__.pyc b/discoseg/__init__.pyc deleted file mode 100644 index d8c3cb7..0000000 Binary files a/discoseg/__init__.pyc and /dev/null differ diff --git a/discoseg/buildedu.pyc b/discoseg/buildedu.pyc deleted file mode 100644 index 5e27367..0000000 Binary files a/discoseg/buildedu.pyc and /dev/null differ diff --git a/discoseg/model/__init__.pyc b/discoseg/model/__init__.pyc deleted file mode 100644 index b6e6248..0000000 Binary files a/discoseg/model/__init__.pyc and /dev/null differ diff --git a/discoseg/model/classifier.pyc b/discoseg/model/classifier.pyc deleted file mode 100644 index fed9056..0000000 Binary files a/discoseg/model/classifier.pyc and /dev/null differ diff --git a/discoseg/model/datastruct.pyc b/discoseg/model/datastruct.pyc deleted file mode 100644 index a6905ff..0000000 Binary files a/discoseg/model/datastruct.pyc and /dev/null differ diff --git a/discoseg/model/docreader.pyc b/discoseg/model/docreader.pyc deleted file mode 100644 index 3e2580a..0000000 Binary files a/discoseg/model/docreader.pyc and /dev/null differ diff --git a/discoseg/model/feature.pyc b/discoseg/model/feature.pyc deleted file mode 100644 index dc7ac84..0000000 Binary files a/discoseg/model/feature.pyc and /dev/null differ diff --git a/discoseg/model/sample.pyc b/discoseg/model/sample.pyc deleted file mode 100644 index 6a4d9ac..0000000 Binary files a/discoseg/model/sample.pyc and /dev/null differ diff --git a/discoseg/model/util.pyc b/discoseg/model/util.pyc deleted file mode 100644 index 640b515..0000000 Binary files a/discoseg/model/util.pyc and /dev/null differ diff --git a/preprocess/__init__.pyc b/preprocess/__init__.pyc deleted file mode 100644 index ed13daa..0000000 Binary files a/preprocess/__init__.pyc and /dev/null differ diff --git a/preprocess/xmlreader.py b/preprocess/xmlreader.py index 2d63fb5..b1d537f 100644 --- a/preprocess/xmlreader.py +++ b/preprocess/xmlreader.py @@ -142,6 +142,8 @@ def combineparse2sent(sent, parse): partialparse = parselist[tidx].replace(' ','') partialparse = partialparse.encode("ascii", "ignore") word = tokenlist[tidx].replace(' ','') + if word == '(' or word == ')': + word = sent.tokenlist[tidx].pos # print word, partialparse if (word + ')') in partialparse: tidx += 1 diff --git a/preprocess/xmlreader.pyc b/preprocess/xmlreader.pyc deleted file mode 100644 index 23a7b24..0000000 Binary files a/preprocess/xmlreader.pyc and /dev/null differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..9f786da --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +click==7.1.2 +joblib==0.14.1 +nltk==3.4.1 +numpy==1.16.6 +regex==2021.7.6 +scikit-learn==0.20.4 +scipy==1.2.3 +singledispatch==3.6.2 +six==1.16.0 +tqdm==4.61.2