-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDatasets.hs
More file actions
61 lines (49 loc) · 2.47 KB
/
Datasets.hs
File metadata and controls
61 lines (49 loc) · 2.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
{-# LANGUAGE OverloadedStrings #-}
module Datasets where
import System.Path.Glob
import qualified Data.ByteString.Char8 as BS
import Data.Char (isAlpha, isUpper)
import Data.List
import Data.Maybe
import Data.Ord
import qualified NLP.Data
import NLP.Data (Annotated(..),NamedSegmentation(..))
import NLP.Tokenizer
-- | The Contours data set, loaded from disk.
load_ds_contours = do
    docs <- NLP.Data.contours "/mnt/work/textseg/data/u-series"
    -- NB: remove rybesh's segs, since they don't cover all documents in the set
    return (map dropRybesh docs)
  where
    -- Strip the "annotators:rybesh" segmentation from one document.
    dropRybesh d = d { segmentation = filter keep (segmentation d) }
      where keep s = segname s /= "annotators:rybesh"
-- | The Contours data set plus the docsouth reference segmentations
-- (only of those documents present in Contours).
load_ds_merged = do
    ds_contours <- load_ds_contours
    ds_docsouth <- NLP.Data.contours "/mnt/work/textseg/data/docsouth"
    -- add docsouth reference segmentations to all present contours documents;
    -- both lists are sorted by name so zipWith pairs matching documents
    let contourNames   = map name ds_contours
        docsouthSubset = filter ((`elem` contourNames) . name) ds_docsouth
        -- Combine a docsouth document with its contours counterpart,
        -- keeping the contours text and concatenating the segmentations.
        merge d1 d2 = Annotated
            { name         = name d1
            , document     = document d1
            , segmentation = segmentation d1 ++ segmentation d2 }
    return (zipWith merge (sortOn name docsouthSubset)
                          (sortOn name ds_contours))
-- | The ds_merged dataset, excluding documents for which the docsouth
-- "segmentation" consists of only one segment.
--
-- NOTE(review): the original used 'fromJust' on 'find', which crashed on
-- any document lacking an "annotators:docsouth" segmentation; such
-- documents are now simply dropped, matching the filter's stated intent.
load_ds = do
    ds <- load_ds_merged
    return $ filter keepDoc ds
  where
    -- True iff the document carries an "annotators:docsouth" segmentation
    -- with at least two segments.
    keepDoc (Annotated _ _ ss) =
        case find ((== "annotators:docsouth") . segname) ss of
            Just s  -> length (segseg s) > 1
            Nothing -> False
-- | Load and tokenize the training texts, treating bare newlines as
-- sentence boundaries and discarding speaker-name lines.
load_training_set = do
    paths <- glob "/mnt/work/textseg/data/U. The*/*/*"
    contents <- mapM BS.readFile paths
    return (map prepare contents)
  where
    prepare = dropSpeakerNames . map promoteNewline . tokenize
    -- Promote a lone newline whitespace token to a sentence break.
    promoteNewline t
        | t == Whitespace "\n" = SentenceBreak "\n"
        | otherwise            = t
-- Drops a line/sentence if all words in it are uppercase and the last punctuation is a colon.
dropSpeakerNames :: [Token] -> [Token]
dropSpeakerNames =
    concat . filter (not . isSpeakerName) . splitAtToken' isBoundary
  where
    -- A segment ends at a sentence break or a bare newline.
    isBoundary t = isSentenceBreak t || t == Whitespace "\n"
    -- A speaker-name line: every alphabetic character of every word is
    -- uppercase, and if any punctuation is present the last one is ":".
    isSpeakerName toks = punctIsColon && allWordsUpper
      where
        puncts        = filter isPunctuation toks
        punctIsColon  = null puncts || tokenText (last puncts) == ":"
        allWordsUpper =
            all (BS.all (\c -> not (isAlpha c) || isUpper c))
                [w | Word w <- toks]
-- | Logical implication: @p `implies` q@ is false only when @p@ holds
-- and @q@ does not.
implies :: Bool -> Bool -> Bool
implies p q = not p || q