1 /***
2 Copyright (C) 2005 The Java Community
3
4 This program is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free Software
6 Foundation; either version 2 of the License, or (at your option) any later
7 version.
8
9 This program is distributed in the hope that it will be useful, but WITHOUT
10 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License along with
14 this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 Place, Suite 330, Boston, MA 02111-1307 USA.
16 */
17 package org.bejug.javacareers.feeder;
18
19 import java.util.ArrayList;
20 import java.util.Collections;
21 import java.util.Comparator;
22 import java.util.Iterator;
23 import java.util.LinkedList;
24 import java.util.List;
25 import java.util.StringTokenizer;
26 import java.util.Timer;
27 import java.util.TimerTask;
28
29 import org.apache.commons.logging.Log;
30 import org.apache.commons.logging.LogFactory;
31 import org.bejug.javacareers.common.search.SearchCriteria;
32 import org.bejug.javacareers.feeder.model.JobList;
33 import org.bejug.javacareers.feeder.model.RssFeed;
34 import org.bejug.javacareers.feeder.parser.JobParser;
35 import org.bejug.javacareers.jobs.model.JobOffer;
36 import org.bejug.javacareers.jobs.model.User;
37 import org.springframework.dao.DataAccessException;
38 import org.springframework.orm.hibernate3.HibernateSystemException;
39
40 /***
41 * Reads RSS files and stores the data in a datasource.
42 *
43 * @author Bavo Bruylandt (Last modified by $Author: shally $)
44 * @version $Revision: 1.6 $ - $Date: 2005/12/20 15:36:45 $
45 */
46 public class AggregatorFeederTask extends FeederTask {
47
48 /***
49 * The class logger.
50 */
51 public static final Log LOG = LogFactory.getLog(AggregatorFeederTask.class);
52
53 /***
54 * The default RSS publisher username.
55 */
56 private static final String PUBLISHER_USERNAME = "RssFeeder";
57
58 /***
59 * The scan cycle in seconds.
60 */
61 private int scanCycle;
62
63
64 /***
65 * Publisher object
66 */
67 private User myPublisher;
68
69
70 /***
71 * keywordList.
72 */
73 private List keyWordList;
74
75 /***
76 * Called by IoC container after constructor.
77 *
78 * @see /applicationContext-Feeder.xml
79 */
80 public void init() {
81 LOG.info("Debug: Creating AggregatorFeederTask");
82 FeederDaemonConfig config = getFeederDaemonConfig();
83 scanCycle = config.getGenerateCycle();
84 String keywords = getFeederDaemonConfig().getKeywords();
85 keyWordList = new LinkedList();
86 StringTokenizer st = new StringTokenizer(keywords, ",");
87 while (st.hasMoreTokens()) {
88 String token = st.nextToken().trim();
89 token = token.toUpperCase();
90 keyWordList.add(token);
91
92 }
93 LOG.info("Debug: KeywordList is " + keyWordList);
94 LOG.info("Debug: Generatecycle: " + scanCycle);
95 filepath = config.getRssFilepath();
96 constructFeeder(config);
97
98 try {
99 Timer timer = new Timer();
100 timer.schedule(new TimerTask() {
101 public void run() {
102 LOG.info("Debug: <<<<<<<<<<<< Aggregating feed at startup>>>>>>>>>>>>>>>>>>>");
103 LOG.info("Debug: <<<<<<<<<<<< Processing feeds >>>>>>>>>>>>>>>>>>>");
104 if (myPublisher == null) {
105 myPublisher = getAdminService().getUserByUserName(PUBLISHER_USERNAME);
106 }
107 if (myPublisher != null) {
108 ArrayList lists = processFeeds();
109 LOG.info("Debug: Processed " + lists.size() + " RSS feeds");
110 addListsToDataBase(lists);
111 LOG.info("Debug: Added " + lists.size() + " RSS feeds");
112 }
113
114 }
115 }, 60 * 1000);
116 } catch (IllegalStateException e) {
117 LOG.error("Init failed: " + e);
118 } catch (IllegalArgumentException e) {
119 LOG.error("Init failed: " + e);
120 }
121 }
122
123 /***
124 * Starts the task in seperate thread.
125 */
126 public synchronized void run() {
127
128 LOG.info("Debug: <<<<<<<<<<<< Processing feeds >>>>>>>>>>>>>>>>>>>");
129 if (myPublisher == null) {
130 myPublisher = getAdminService().getUserByUserName(PUBLISHER_USERNAME);
131 }
132 if (myPublisher != null) {
133 ArrayList lists = processFeeds();
134 LOG.info("Debug: Processed " + lists.size() + " RSS feeds");
135 addListsToDataBase(lists);
136 LOG.info("Debug: Added " + lists.size() + " RSS feeds");
137 } else {
138 LOG.info("Debug: Publisher for feeder " + PUBLISHER_USERNAME + " was not found. Not Scanning.");
139
140 }
141
142 }
143
144 /***
145 * Process the feeds and returns JobLists.
146 *
147 * @return List of JobLists, one for each feed
148 */
149 public ArrayList processFeeds() {
150 ArrayList lists = new ArrayList();
151 JobParser jobParser = new JobParser();
152
153
154 List feeds = getRssFeedService().getRssFeeds();
155 LOG.info("Debug: Found " + feeds.size() + " Feed(s)");
156 for (int i = 0; i < feeds.size(); i++) {
157 RssFeed rssFeed = (RssFeed) feeds.get(i);
158 LOG.info("Debug: Parsing feed: " + rssFeed);
159
160
161
162 JobList list = null;
163 try {
164 list = jobParser.parse(rssFeed);
165 lists.add(list);
166 } catch (FeederException e) {
167 LOG.error("Parse failed on : " + rssFeed.getUri() + " " + e);
168 }
169
170 }
171 return lists;
172 }
173
174 /***
175 * Adds the Joblists to the database.
176 *
177 * @param lists List of JobLists
178 */
179 public void addListsToDataBase(ArrayList lists) {
180 LOG.info("Debug: Showing lists");
181 LOG.info("Debug: lists = " + lists);
182 LOG.info("Debug: Adding lists to database (" + lists.size() + ")");
183
184 for (int a = 0; a < lists.size(); a++) {
185 JobList list = (JobList) lists.get(a);
186 if (list != null) {
187 LOG.info("Debug: Getting jobOfferbyURL " + list.getSource());
188 SearchCriteria searchCriteria = getSearchCriteriaFactory().createSearchCriteria(JobOffer.class);
189 searchCriteria.addEqualsCriterium("feedUrl", list.getSource());
190 List searchList = null;
191 try {
192 searchList = getJobService().getJobOffers(searchCriteria);
193 } catch (IllegalArgumentException e1) {
194 LOG.info("Debug: Illegal argument:" + e1);
195 }
196 mergeAndUpdateLists(list, searchList);
197 try {
198 for (int i = 0; i < searchList.size(); i++) {
199 JobOffer jobOffer = (JobOffer) searchList.get(i);
200 getJobService().deleteJobOffer(jobOffer);
201
202 }
203
204
205 } catch (HibernateSystemException e) {
206 LOG.error("Delete error: " + e);
207 }
208 LOG.info("Debug: Done deleting");
209 }
210 for (int j = 0; j < list.size(); j++) {
211 JobOffer offer = (JobOffer) list.get(j);
212 if (isValid(offer, keyWordList)) {
213
214 LOG.info("Debug: ** setting publisher");
215
216
217 offer.setUser(myPublisher);
218 offer.setRssFeed(true);
219 offer.setAdminApproved(true);
220
221 LOG.info("Debug: ** adding offer: " + offer.getTitle());
222 LOG.info("Debug: ** id is: " + offer.getId());
223
224 try {
225 getJobService().storeJobOffer(offer);
226 LOG.info("Debug: ** id is: " + offer.getId());
227 } catch (DataAccessException ex) {
228 LOG.debug(ex);
229 }
230
231 LOG.info("Debug: Added entry: " + offer.getDescription());
232 }
233 }
234 }
235 }
236
237
238 /***
239 * merges and updates the lists.
240 * @param newList resulting list from the merge.
241 * @param oldList old list to update.
242 */
243 public void mergeAndUpdateLists(List newList, List oldList) {
244 Collections.sort(newList, byFeedURL);
245 Collections.sort(oldList, byFeedURL);
246
247 for (Iterator it = newList.iterator(); it.hasNext();) {
248 JobOffer jobOffer = (JobOffer) it.next();
249 int search = Collections.binarySearch(oldList,jobOffer,byFeedURL);
250 LOG.info("Debug: Search returned: "+search+" for id "+jobOffer.getTitle()+ " for url "+jobOffer.getUrl());
251 if (search == 0) {
252 oldList.remove(search);
253 it.remove();
254
255 LOG.info("Debug: Removed duplicate from list: "+jobOffer.getId()+" - " +jobOffer.getTitle()+" - "+jobOffer.getUrl());
256 }
257
258
259 }
260 LOG.info("Debug: For removal: "+oldList);
261 LOG.info("Debug: For adding: "+newList);
262
263
264 }
265
266 /***
267 * implementation of the comparator.
268 */
269 private Comparator byFeedURL = new Comparator() {
270
271 /***
272 * Compare nmber of occurences of searched criteria.
273 *
274 * @param o1 row 1
275 * @param o2 row 2
276 * @return 0 if equal, ...
277 */
278 public int compare(Object o1, Object o2) {
279 JobOffer offer1 = (JobOffer) o1;
280 JobOffer offer2 = (JobOffer) o2;
281 int compare = offer1.getUrl().compareTo(offer2.getUrl());
282 LOG.info("Debug: Compare: "+ compare);
283 LOG.info("Debug: Url1: "+ offer1.getUrl()+ " Url2: "+offer2.getUrl());
284 return compare;
285
286 }
287 };
288
289
290
291 /***
292 * @return the comparator needed.
293 */
294 public Comparator getByFeedURL() {
295 return byFeedURL;
296 }
297
298 /***
299 * @param byFeedURL the comparator.
300 */
301 public void setByFeedURL(Comparator byFeedURL) {
302 this.byFeedURL = byFeedURL;
303 }
304
305 /***
306 * Checks wheter title or description contains matching keywords
307 * With a keyword like JAVA then: Java is true, Javascript is false
308 * Javaa/J2EE is true and Djava false
309 *
310 * @param offer the offer to check
311 * @param keywords List
312 * @return true if valid on a keyword
313 */
314 boolean isValid(JobOffer offer, List keywords) {
315 String offerTitle = " " + offer.getTitle().toUpperCase() + " ";
316 String offerDesc = " " + offer.getDescription().toUpperCase() + " ";
317 boolean isValid = false;
318 for (int i = 0; i < keyWordList.size(); i++) {
319 String s = (String) keyWordList.get(i);
320
321 int indTitle = offerTitle.indexOf(s);
322 int indDesc = offerDesc.indexOf(s);
323
324
325 if (indTitle != -1 || indDesc != -1) {
326 if (indTitle != -1) {
327 char c = offerTitle.charAt(indTitle + s.length());
328 char d = offerTitle.charAt(indTitle - 1);
329 if (!Character.isLetter(c) && !Character.isLetter(d)) {
330 isValid = true;
331 }
332
333 }
334 if (indDesc != -1) {
335 char c = offerDesc.charAt(indDesc + s.length());
336 char d = offerDesc.charAt(indDesc - 1);
337 if (!Character.isLetter(c) && !Character.isLetter(d)) {
338 isValid = true;
339 }
340
341 }
342 }
343 }
344 LOG.info("Debug: IsValid: " + isValid + " - " + offer);
345 return isValid;
346
347 }
348
349
350 }
351
352 /***
353 * $Log: AggregatorFeederTask.java,v $
354 * Revision 1.6 2005/12/20 15:36:45 shally
355 * CheckStyle and PMD changes.
356 *
357 * Revision 1.5 2005/12/09 10:46:55 shally
358 * Opkuis voor checkstyle en PMD
359 *
360 * Revision 1.4 2005/09/30 14:38:08 bavo_jcs
361 * Fixed URL
362 *
363 * Revision 1.3 2005/09/19 16:15:19 schauwvliege
364 * Introduction of Approve items
365 *
366 * Revision 1.2 2005/09/13 08:11:17 schauwvliege
367 * organize imports
368 *
369 * Revision 1.1 2005/08/26 07:58:29 ge0ffrey
370 * split up the sources in service, serviceimpl and webclient
371 *
372 * Revision 1.24 2005/08/10 09:04:48 bavo_jcs
373 * Optimized imports according to checkstyle
374 *
375 * Revision 1.23 2005/08/09 12:59:54 bavo_jcs
376 * Optimized imports
377 *
378 * Revision 1.22 2005/08/08 12:08:02 bme_jcs
379 * resolved checkstyle errors
380 *
381 * Revision 1.21 2005/08/08 09:38:22 bme_jcs
382 * resolved checkstyle errors
383 *
384 * Revision 1.20 2005/08/05 14:21:07 bavo_jcs
385 * Feeder smart delete fix
386 *
387 * Revision 1.19 2005/08/05 09:50:10 bavo_jcs
388 * Deleted a system.out
389 *
390 * Revision 1.18 2005/08/05 08:27:55 bme_jcs
391 * resolved checkstyle errors
392 *
393 * Revision 1.17 2005/08/03 15:22:54 bavo_jcs
394 * Feeder Smart delete
395 *
396 * Revision 1.16 2005/08/02 15:36:35 bavo_jcs
397 * Feeder update improvement
398 *
399 * Revision 1.15 2005/07/20 15:07:27 bavo_jcs
400 * Feeder smart delete
401 *
402 * Revision 1.14 2005/07/07 14:55:13 bavo_jcs
403 * Ajax integration
404 *
405 * Revision 1.13 2005/07/05 14:46:01 schauwvliege
406 * Moved test data to AdminBootstrap
407 *
408 * Revision 1.12 2005/06/30 10:36:23 bavo_jcs
409 * change db init
410 *
411 * Revision 1.11 2005/06/30 10:33:02 bavo_jcs
412 * change db init
413 *
414 * Revision 1.10 2005/06/17 12:01:17 schauwvliege
415 * CheckStyle/ PMD
416 *
417 * Revision 1.9 2005/06/17 11:42:46 schauwvliege
418 * CheckStyle/ PMD
419 *
420 * Revision 1.8 2005/06/17 09:01:43 schauwvliege
421 * CheckStyle
422 *
423 * Revision 1.7 2005/06/14 13:40:05 schauwvliege
424 * Renamed add to store
425 *
426 * Revision 1.6 2005/06/14 12:05:52 schauwvliege
427 * CheckStyle and fixing tests
428 *
429 * Revision 1.5 2005/06/10 15:37:23 bavo_jcs
430 * int returned on delete
431 *
432 * Revision 1.4 2005/06/10 14:46:01 bavo_jcs
433 * web->webclient
434 *
435 * Revision 1.3 2005/06/10 13:27:20 bavo_jcs
436 * new version
437 *
438 * Revision 1.2 2005/06/09 08:18:43 bejug_cc
439 * Fix initial import
440 *
441 * Revision 1.18 2005/06/07 14:38:48 bbr
442 * Lucene highlightterms added
443 *
444 * Revision 1.17 2005/06/07 13:03:10 bbr
445 * Keyword filtering
446 *
447 * Revision 1.16 2005/06/06 15:47:24 bbr
448 * job source
449 *
450 * Revision 1.15 2005/06/05 12:25:04 sja
451 * Added sourceUrl.
452 *
453 * Revision 1.14 2005/06/03 09:44:09 bbr
454 * admin feed panel work
455 *
456 * Revision 1.13 2005/06/02 15:49:25 PSONG09
457 * modified company name of rss feed user
458 *
459 * Revision 1.12 2005/06/01 15:07:11 bbr
460 * RssFeed page
461 *
462 * Revision 1.11 2005/06/01 12:36:54 bbr
463 * RssFeedService
464 *
465 * Revision 1.10 2005/05/31 13:30:49 bbr
466 * reorganized contexts for tests
467 *
468 * Revision 1.9 2005/05/31 11:56:19 bbr
469 * deleted JobEntry
470 *
471 * Revision 1.8 2005/05/30 14:14:56 bbr
472 * servletcontext enabled
473 *
474 * Revision 1.7 2005/05/30 12:04:42 bbr
475 * using javacareersconfig
476 *
477 * Revision 1.6 2005/05/26 14:28:41 PSONG09
478 * integration with view
479 *
480 * Revision 1.5 2005/05/26 08:59:30 bbr
481 * split cron
482 * made tasks run at startup
483 *
484 * Revision 1.4 2005/05/25 15:25:04 bbr
485 * testdata
486 *
487 * Revision 1.3 2005/05/25 10:42:59 sja
488 * Removed default constructor and added javadoc to init method.
489 *
490 * Revision 1.2 2005/05/24 15:33:26 bbr
491 * Using spring sheduling
492 *
493 * Revision 1.1 2005/05/24 11:52:39 bbr
494 * Using spring sheduling
495 *
496 * Revision 1.1 2005/05/23 17:04:57 sja
497 * Moved to org.bejug.javacareers.feeder package.
498 *
499 * Revision 1.3 2005/05/23 15:33:12 bbr
500 * added weight to lucene
501 *
502 * Revision 1.2 2005/05/23 12:27:57 bbr
503 * no message
504 *
505 * Revision 1.1 2005/05/23 08:46:33 PSONG09
506 * added feeder source files to project
507 *
508 * Revision 1.8 2005/05/23 07:16:35 stephan_janssen
509 * Code cleanup.
510 *
511 * Revision 1.7 2005/05/22 16:57:02 stephan_janssen
512 * Replaced HibernateException with DataAccessException.
513 *
514 * Revision 1.6 2005/05/20 14:41:46 bavo_jcs
515 * minor changes
516 *
517 * Revision 1.5 2005/05/18 15:46:39 bavo_jcs
518 * -adeed lucene service
519 *
520 * Revision 1.4 2005/05/18 11:49:51 bavo_jcs
521 * no message
522 *
523 * Revision 1.3 2005/05/11 14:25:22 bavo_jcs
524 * - renamed main files
525 *
526 * Revision 1.2 2005/05/11 13:16:02 bavo_jcs
527 * - debugged thread
528 *
529 * Revision 1.1 2005/05/11 11:53:25 bavo_jcs
530 * refactored
531 * - conform to conventions
532 * - some javadoc
533 * - Added FeederTask design
534 *
535 */