Skip to content

Commit a5b1b62

Browse files
mawiesnerzo1
authored and committed
OPENNLP-1226 Training an NER model for dates with 'dd.mm.yyyy' as Date format
- adds NameFinderMEWithDatesTest verifying German (and English) date formats, via custom training data per language, as a reproducer for OPENNLP-1226
- adds RandomGermanNewsGenerator to generate synthetic news corpora with annotated dates, typical for the DE locale
- adds RandomEnglishNewsGenerator to generate synthetic news corpora with annotated dates, typical for the EN (US/UK/AUS) locales
- extracts code to AbstractNameFinderTest to avoid copy-and-paste of reusable code
1 parent f112107 commit a5b1b62

File tree

10 files changed

+21002
-103
lines changed

10 files changed

+21002
-103
lines changed
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package opennlp.tools.cmdline.namefind.generator;
19+
20+
import java.text.SimpleDateFormat;
21+
import java.util.Calendar;
22+
import java.util.Date;
23+
import java.util.GregorianCalendar;
24+
import java.util.Locale;
25+
import java.util.Random;
26+
import java.util.concurrent.ThreadLocalRandom;
27+
28+
/**
 * Base class for generators of synthetic news sentences that contain randomly
 * chosen, randomly formatted dates.
 * <p>
 * Subclasses supply the date patterns to use via {@link #getSupportedDateFormats()}.
 */
abstract class AbstractNewsGenerator {

  /** Calendar that subclasses may pass to {@link #generateRandomDate(Calendar)}. */
  protected Calendar cal = new GregorianCalendar();

  /** Source of randomness for choosing a date pattern; created once instead of per call. */
  private final Random formatPicker = new Random();

  /**
   * @return The {@link SimpleDateFormat} patterns this generator may use.
   *         Must contain at least one pattern.
   */
  abstract String[] getSupportedDateFormats();

  /**
   * Draws a random point in time between 1 January 1900, 00:00:00.000 (inclusive)
   * and the current time (exclusive).
   *
   * @param cal Provides the calendar system and time zone in which the lower bound
   *            is interpreted. It is not modified.
   * @return A randomly chosen {@link Date} within that range.
   */
  Date generateRandomDate(Calendar cal) {
    // Work on a copy so the caller's calendar keeps its state, and clear all fields
    // first: set(year, month, day) alone would leave the time-of-day fields as they
    // were, making the lower bound depend on whatever time the calendar held.
    Calendar lowerBound = (Calendar) cal.clone();
    lowerBound.clear();
    lowerBound.set(1900, Calendar.JANUARY, 1);

    long startMillis = lowerBound.getTimeInMillis();
    long endMillis = System.currentTimeMillis();
    long randomMillisSinceEpoch = ThreadLocalRandom.current().nextLong(startMillis, endMillis);
    return new Date(randomMillisSinceEpoch);
  }

  /**
   * Formats {@code date} with one randomly chosen pattern from
   * {@link #getSupportedDateFormats()}.
   * <p>
   * Despite its name, this method returns only the formatted date text;
   * no tags are added here.
   *
   * @param date The date to format.
   * @param loc  The locale used for month and day names.
   * @return The formatted date.
   * @throws IllegalStateException Thrown if no date patterns are available.
   */
  String formatDateWithTags(Date date, Locale loc) {
    String[] formats = getSupportedDateFormats();
    if (formats == null || formats.length == 0) {
      throw new IllegalStateException("At least one supported date format is required.");
    }
    String pattern = formats[formatPicker.nextInt(formats.length)];
    // SimpleDateFormat is not thread-safe, so a fresh instance is created per call.
    SimpleDateFormat dateFormat = new SimpleDateFormat(pattern, loc);
    return dateFormat.format(date);
  }
}
Lines changed: 272 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,272 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package opennlp.tools.cmdline.namefind.generator;
19+
20+
import java.io.BufferedWriter;
21+
import java.io.FileWriter;
22+
import java.io.IOException;
23+
import java.util.Locale;
24+
import java.util.Random;
25+
26+
import org.slf4j.Logger;
27+
import org.slf4j.LoggerFactory;
28+
29+
/**
30+
* Generates randomly created sentences for the English language
31+
* to compile a text corpus for training purposes.
32+
* <p>
33+
* Covers famous books and movies, recent sport events, music highlights, and events
34+
* from politics and science, distributed across the last 100 years.
35+
* <p>
36+
* Dates are formatted as {@code <START:date> ... <END>} in different English date format variations.
37+
*
38+
* @implNote Event template phrases generated by AI (MS Copilot).
39+
*
40+
* @see opennlp.tools.namefind.NameFinderMEWithDatesTest
41+
*/
42+
public class RandomEnglishNewsGenerator extends AbstractNewsGenerator {
43+
44+
private static final Logger logger = LoggerFactory.getLogger(RandomEnglishNewsGenerator.class);
45+
46+
private static final String START_DATE = "<START:date> ";
47+
private static final String END = " <END>";
48+
49+
// Origin: https://en.wikipedia.org/wiki/List_of_date_formats_by_country
50+
private static final String[] DATE_FORMATS = {
51+
"d MMMM yyyy", // UK
52+
"dd/mm/yyyy", // UK
53+
"m/d/yy", // US
54+
"m/d/yyyy", // US
55+
"yyyy-mm-dd", // US
56+
"MMMM d, yyyy", // Australia
57+
"EEEE, dd/mm/yyyy", // UK with name of day
58+
"EEEE, yyyy-mm-dd", // US with name of day
59+
"MMMM yyyy", // month name + year
60+
};
61+
62+
private static final String[] SPORTS_EVENTS = {
63+
"the legendary boxing match between Joe Louis and Max Schmeling took place.",
64+
"the Olympic Games were held in Rome.",
65+
"the ‘Miracle of Bern’ occurred, when the German national football team won the World Cup.",
66+
"the Winter Olympics were held in Lake Placid.",
67+
"the Football World Cup took place in Italy.",
68+
"the Olympic Games were held in Atlanta.",
69+
"the FIFA World Cup was held in Brazil",
70+
"the Olympic Games took place in Tokyo",
71+
"the European Football Championship was held in several European cities.",
72+
"the Winter Olympics were held in Beijing.",
73+
"the FIFA World Cup was held in Qatar.",
74+
"the Olympic Games will take place in Paris.",
75+
"the Brazilian national football team won the World Cup.",
76+
"Usain Bolt set a new world record in the 100-meter sprint.",
77+
"Serena Williams won her 23rd Grand Slam title.",
78+
"the NBA championship final took place, with the Los Angeles Lakers taking the title.",
79+
"Michael Phelps won his 23rd Olympic gold medal.",
80+
"the Wimbledon final took place, with Roger Federer winning his eighth title.",
81+
"the German national football team won the European Championship.",
82+
"the Super Bowl final took place, with the New England Patriots emerging victorious.",
83+
"Rafael Nadal won his 13th French Open title.",
84+
"the UEFA Champions League final took place, with Real Madrid lifting the title."
85+
};
86+
87+
private static final String[] MUSIC_EVENTS = {
88+
"Elvis Presley's first single ‘That's All Right’ was released.",
89+
"the album ‘A Night at the Opera’ by Queen was released.",
90+
"the Woodstock Festival began.",
91+
"the album ‘Nevermind’ by Nirvana was released.",
92+
"the album ‘25’ by Adele was released.",
93+
"Michael Jackson, the ‘King of Pop’, died.",
94+
"the album ‘A Seat at the Table’ by Solange was released.",
95+
"the album ‘Montero’ by Lil Nas X was released.",
96+
"the album ‘Motomami’ by Rosalía was released.",
97+
"the album ‘Midnights’ by Taylor Swift was released.",
98+
"the album ‘Thriller’ by Michael Jackson was released.",
99+
"Pink Floyd's ‘The Dark Side of the Moon’ was released.",
100+
"The Beatles‘ ’Abbey Road‘ was released.",
101+
"AC/DC's ’Back in Black‘ was released.",
102+
"Fleetwood Mac's ’Rumours‘ was released.",
103+
"The Eagles’ ‘Hotel California’ was released.",
104+
"Bruce Springsteen's album ‘Born to Run’ was released.",
105+
"Prince's album ‘Purple Rain’.",
106+
"Pink Floyd's album ‘The Wall’.",
107+
"Led Zeppelin's album ‘Led Zeppelin IV’."
108+
};
109+
110+
private static final String[] CINEMA_EVENTS = {
111+
"the film ‘Gone with the Wind’ was released in cinemas.",
112+
"the film ‘Lawrence of Arabia’ was released.",
113+
"the film ‘Star Wars: A New Hope’ was released in cinemas.",
114+
"the film ‘Titanic’ was released.",
115+
"the film ‘Avatar’ was released in cinemas.",
116+
"the film ‘Frozen’ was released.",
117+
"the film ‘Thor: Ragnarok’ was released in cinemas.",
118+
"the film ‘The Godfather’ was released.",
119+
"the film ‘Schindler's List’ was released.",
120+
"the film ‘The Lord of the Rings: The Fellowship of the Ring’ was released in cinemas.",
121+
"the film ‘Forrest Gump’ was released.",
122+
"the film ‘The Lion King’ was released.",
123+
"the film ‘Jurassic Park’ was released in cinemas.",
124+
"the film ‘Inception’ was released.",
125+
"the film ‘The Matrix’ was released.",
126+
"the film ‘Gladiator’ was released in cinemas.",
127+
"the film ‘Pulp Fiction’ was released."
128+
};
129+
130+
private static final String[] ECONOMY_EVENTS = {
131+
"it was announced that the global economy had grown by 3.5%.",
132+
"an important agreement was signed to promote international cooperation in the field of economics.",
133+
"a major conference on global economic services took place, at which leading experts discussed " +
134+
"strategies for improving economic services.",
135+
"the introduction of the euro as a common currency in Europe was decided.",
136+
"the World Bank was established.",
137+
"the European Central Bank was established.",
138+
"the Bretton Woods Agreement was signed.",
139+
"the first meeting of the World Economic Forum was held in Davos.",
140+
"the World Trade Organization (WTO) was established.",
141+
"the Maastricht Treaty was signed, leading to the establishment of the European Union.",
142+
"the introduction of the Marshall Plan for economic reconstruction aid after the World War II.",
143+
"the decision was taken to establish the Organization for Economic Cooperation and Development (OECD).",
144+
"the first G20 summit took place."
145+
};
146+
147+
private static final String[] POLITICS_EVENTS = {
148+
"parliamentary elections took place in Germany.",
149+
"a new law was passed to promote digital education.",
150+
"an important agreement was signed to promote international cooperation in the field of politics.",
151+
"the Universal Declaration of Human Rights was adopted by the United Nations.",
152+
"the founding of the United Nations took place.",
153+
"the Treaty of Versailles was signed, ending the First World War.",
154+
"NATO was founded.",
155+
"the first session of the European Parliament took place",
156+
"the Lisbon Treaty was signed, reforming the European Union",
157+
"the Berlin Wall was built",
158+
"the reunification of Germany took place",
159+
"the Treaty of Rome was signed, leading to the establishment of the" +
160+
" European Economic Community",
161+
"the United Nations Charter was signed."
162+
};
163+
164+
private static final String[] SCIENCE_EVENTS = {
165+
"a significant breakthrough in quantum computing research was made",
166+
"a major scientific project to study climate change was launched",
167+
"a major scientific symposium was held on the latest developments" +
168+
" in genetic research",
169+
"the structure of DNA was discovered by James Watson and Francis Crick",
170+
"the Apollo 11 mission successfully landed on the moon",
171+
"the first artificial heart was successfully transplanted",
172+
"the Human Genome Project was completed",
173+
"the first cloned sheep, Dolly, was born",
174+
"the Large Hadron Collider was launched",
175+
"the first vaccine against COVID-19 was approved",
176+
"the first image of a black hole was published",
177+
"the first successful gene therapy was performed in a human",
178+
"the first space probe landed on Mars",
179+
"the existence of gravitational waves was detected",
180+
"the first artificial organ was successfully transplanted",
181+
"the first CRISPR gene editing was performed in a human",
182+
"the first all-electric aircraft was tested",
183+
"the first quantum communication via satellite was carried out",
184+
"the first successful stem cell therapy was performed in a human",
185+
"the first image of an exoplanet was published."
186+
};
187+
188+
private static final String[] SENTENCE_STARTS_WITH_DATE = {
189+
"On",
190+
"In",
191+
"On the",
192+
"During the",
193+
"At the start of",
194+
"Towards the end of",
195+
"In the middle of",
196+
"Shortly after",
197+
"Around",
198+
"During",
199+
"After",
200+
"Before",
201+
"Around"
202+
};
203+
204+
private static final String[] SENTENCE_STARTS_WITHOUT_DATE = {
205+
"As reported,",
206+
"According to an announcement,",
207+
"According to the latest reports,",
208+
"According to a statement,",
209+
"As reported by Reuters,",
210+
"According to the authorities,",
211+
"According to experts,"
212+
};
213+
214+
private static final int NUM_SENTENCES = 10000;
215+
216+
@Override
217+
String[] getSupportedDateFormats() {
218+
return DATE_FORMATS;
219+
}
220+
221+
public static void main(String[] args) {
222+
String outputFileName;
223+
if (args.length != 1) {
224+
outputFileName = "RandomNewsWithGeneratedDates_EN.train";
225+
} else {
226+
outputFileName = args[0];
227+
}
228+
229+
RandomEnglishNewsGenerator newsGen = new RandomEnglishNewsGenerator();
230+
try (BufferedWriter writer = new BufferedWriter(
231+
new FileWriter(outputFileName))) {
232+
for (int i = 0; i < NUM_SENTENCES; i++) {
233+
writer.write(newsGen.generateSentence());
234+
writer.newLine();
235+
}
236+
logger.info("Text corpus with '{}' sentences generated and written to: '{}'",
237+
NUM_SENTENCES, outputFileName);
238+
} catch (IOException e) {
239+
logger.error(e.getLocalizedMessage(), e);
240+
}
241+
}
242+
243+
private String generateSentence() {
244+
String[] eventTypes = {"sports", "music", "cinema", "economy", "politics", "science"};
245+
String eventType = eventTypes[new Random().nextInt(eventTypes.length)];
246+
247+
String event = switch (eventType) {
248+
case "sports" -> SPORTS_EVENTS[new Random().nextInt(SPORTS_EVENTS.length)];
249+
case "music" -> MUSIC_EVENTS[new Random().nextInt(MUSIC_EVENTS.length)];
250+
case "cinema" -> CINEMA_EVENTS[new Random().nextInt(CINEMA_EVENTS.length)];
251+
case "economy" -> ECONOMY_EVENTS[new Random().nextInt(ECONOMY_EVENTS.length)];
252+
case "politics" -> POLITICS_EVENTS[new Random().nextInt(POLITICS_EVENTS.length)];
253+
case "science" -> SCIENCE_EVENTS[new Random().nextInt(SCIENCE_EVENTS.length)];
254+
default -> "";
255+
};
256+
257+
Random random = new Random();
258+
String sentence;
259+
if (random.nextDouble() < 0.15) {
260+
String date = START_DATE + formatDateWithTags(generateRandomDate(cal), Locale.ENGLISH) + END;
261+
String sentenceStart = SENTENCE_STARTS_WITH_DATE[
262+
new Random().nextInt(SENTENCE_STARTS_WITH_DATE.length)];
263+
sentence = String.format("%s %s %s", sentenceStart, date, event);
264+
} else {
265+
String sentenceStart = SENTENCE_STARTS_WITHOUT_DATE[
266+
new Random().nextInt(SENTENCE_STARTS_WITHOUT_DATE.length)];
267+
sentence = String.format("%s %s", sentenceStart, event);
268+
}
269+
return sentence;
270+
}
271+
272+
}

0 commit comments

Comments
 (0)