View Javadoc

1   /***
2    Copyright (C) 2005 The Java Community
3   
4    This program is free software; you can redistribute it and/or modify  it under
5    the terms of the GNU General Public License as published by  the Free Software
6    Foundation; either version 2 of the License, or  (at your option) any later
7    version.
8   
9    This program is distributed in the hope that it will be useful,  but WITHOUT
10   ANY WARRANTY; without even the implied warranty of  MERCHANTABILITY or FITNESS
11   FOR A PARTICULAR PURPOSE. See the  GNU General Public License for more details.
12  
13   You should have received a copy of the GNU General Public License  along with
14   this program; if not, write to the Free Software  Foundation, Inc., 59 Temple
15   Place, Suite 330, Boston, MA 02111-1307 USA.
16   */
17  package org.bejug.javacareers.feeder;
18  
19  import java.util.ArrayList;
20  import java.util.Collections;
21  import java.util.Comparator;
22  import java.util.Iterator;
23  import java.util.LinkedList;
24  import java.util.List;
25  import java.util.StringTokenizer;
26  import java.util.Timer;
27  import java.util.TimerTask;
28  
29  import org.apache.commons.logging.Log;
30  import org.apache.commons.logging.LogFactory;
31  import org.bejug.javacareers.common.search.SearchCriteria;
32  import org.bejug.javacareers.feeder.model.JobList;
33  import org.bejug.javacareers.feeder.model.RssFeed;
34  import org.bejug.javacareers.feeder.parser.JobParser;
35  import org.bejug.javacareers.jobs.model.JobOffer;
36  import org.bejug.javacareers.jobs.model.User;
37  import org.springframework.dao.DataAccessException;
38  import org.springframework.orm.hibernate3.HibernateSystemException;
39  
40  /***
41   * Reads RSS files and stores the data in a datasource.
42   *
43   * @author Bavo Bruylandt (Last modified by $Author: shally $)
44   * @version $Revision: 1.6 $ - $Date: 2005/12/20 15:36:45 $
45   */
46  public class AggregatorFeederTask extends FeederTask {
47  
48      /***
49       * The class logger.
50       */
51      public static final Log LOG = LogFactory.getLog(AggregatorFeederTask.class);
52  
53      /***
54       * The default RSS publisher username.
55       */
56      private static final String PUBLISHER_USERNAME = "RssFeeder";
57  
58      /***
59       * The scan cycle in seconds.
60       */
61      private int scanCycle;
62  
63  
64      /***
65       * Publisher object
66       */
67      private User myPublisher;
68      
69      
70      /***
71       * keywordList.
72       */
73      private List keyWordList;
74  
75      /***
76       * Called by IoC container after constructor.
77       *
78       * @see /applicationContext-Feeder.xml
79       */
80      public void init() {
81          LOG.info("Debug: Creating AggregatorFeederTask");
82          FeederDaemonConfig config = getFeederDaemonConfig();
83          scanCycle = config.getGenerateCycle();
84          String keywords = getFeederDaemonConfig().getKeywords();
85          keyWordList = new LinkedList();
86          StringTokenizer st = new StringTokenizer(keywords, ",");
87          while (st.hasMoreTokens()) {
88              String token = st.nextToken().trim();
89              token = token.toUpperCase();
90              keyWordList.add(token);
91  
92          }
93          LOG.info("Debug: KeywordList is " + keyWordList);
94          LOG.info("Debug: Generatecycle: " + scanCycle);
95          filepath = config.getRssFilepath();
96          constructFeeder(config);
97  
98          try {
99              Timer timer = new Timer();
100             timer.schedule(new TimerTask() {
101                 public void run() {
102                     LOG.info("Debug: <<<<<<<<<<<< Aggregating feed at startup>>>>>>>>>>>>>>>>>>>");
103                     LOG.info("Debug: <<<<<<<<<<<< Processing feeds >>>>>>>>>>>>>>>>>>>");
104                     if (myPublisher == null) {
105                         myPublisher = getAdminService().getUserByUserName(PUBLISHER_USERNAME);
106                     }
107                     if (myPublisher != null) {
108                         ArrayList lists = processFeeds();
109                         LOG.info("Debug: Processed " + lists.size() + " RSS feeds");
110                         addListsToDataBase(lists);
111                         LOG.info("Debug: Added " + lists.size() + " RSS feeds");
112                     }
113 
114                 }
115             }, 60 * 1000);
116         } catch (IllegalStateException e) {
117             LOG.error("Init failed: " + e);
118         } catch (IllegalArgumentException e) {
119             LOG.error("Init failed: " + e);
120         }
121     }
122 
123     /***
124      * Starts the task in seperate thread.
125      */
126     public synchronized void run() {
127 
128         LOG.info("Debug: <<<<<<<<<<<< Processing feeds >>>>>>>>>>>>>>>>>>>");
129         if (myPublisher == null) {
130             myPublisher = getAdminService().getUserByUserName(PUBLISHER_USERNAME);
131         }
132         if (myPublisher != null) {
133             ArrayList lists = processFeeds();
134             LOG.info("Debug: Processed " + lists.size() + " RSS feeds");
135             addListsToDataBase(lists);
136             LOG.info("Debug: Added " + lists.size() + " RSS feeds");
137         } else {
138             LOG.info("Debug: Publisher for feeder " + PUBLISHER_USERNAME + " was not found. Not Scanning.");
139 
140         }
141 
142     }
143 
144     /***
145      * Process the feeds and returns JobLists.
146      *
147      * @return List of JobLists, one for each feed
148      */
149     public ArrayList processFeeds() {
150         ArrayList lists = new ArrayList();
151         JobParser jobParser = new JobParser();
152 
153         // Get rss feeds from db
154         List feeds = getRssFeedService().getRssFeeds();
155         LOG.info("Debug: Found " + feeds.size() + " Feed(s)");
156         for (int i = 0; i < feeds.size(); i++) {
157             RssFeed rssFeed = (RssFeed) feeds.get(i);
158             LOG.info("Debug: Parsing feed: " + rssFeed);
159 
160             // Parse the feed to a job list
161 
162             JobList list = null;
163             try {
164                 list = jobParser.parse(rssFeed);
165                 lists.add(list);
166             } catch (FeederException e) {
167                 LOG.error("Parse failed on : " + rssFeed.getUri() + " " + e);
168             }
169 
170         }
171         return lists;
172     }
173 
174     /***
175      * Adds the Joblists to the database.
176      *
177      * @param lists List of JobLists
178      */
179     public void addListsToDataBase(ArrayList lists) {
180         LOG.info("Debug: Showing lists");
181         LOG.info("Debug: lists = " + lists);
182         LOG.info("Debug: Adding lists to database (" + lists.size() + ")");
183 
184         for (int a = 0; a < lists.size(); a++) {
185             JobList list = (JobList) lists.get(a);
186             if (list != null) {
187                 LOG.info("Debug: Getting jobOfferbyURL " + list.getSource());
188                 SearchCriteria searchCriteria = getSearchCriteriaFactory().createSearchCriteria(JobOffer.class);
189                 searchCriteria.addEqualsCriterium("feedUrl", list.getSource());
190                 List searchList = null;
191                 try {
192                     searchList = getJobService().getJobOffers(searchCriteria);
193                 } catch (IllegalArgumentException e1) {
194                     LOG.info("Debug: Illegal argument:" + e1);
195                 }
196                 mergeAndUpdateLists(list, searchList);
197                 try {
198                     for (int i = 0; i < searchList.size(); i++) {
199                         JobOffer jobOffer = (JobOffer) searchList.get(i);
200                         getJobService().deleteJobOffer(jobOffer);
201 
202                     }
203                     //int deleted = getJobService().deleteJobOffersByUrl(list.getSource());
204                     //LOG.info("Debug: Delete success: " + deleted);
205                 } catch (HibernateSystemException e) {
206                     LOG.error("Delete error: " + e);
207                 }
208                 LOG.info("Debug: Done deleting");
209             }
210             for (int j = 0; j < list.size(); j++) {
211                 JobOffer offer = (JobOffer) list.get(j);
212                 if (isValid(offer, keyWordList)) {
213 
214                     LOG.info("Debug: ** setting publisher");
215 
216 
217                     offer.setUser(myPublisher);
218                     offer.setRssFeed(true);
219                     offer.setAdminApproved(true);
220 
221                     LOG.info("Debug: ** adding offer: " + offer.getTitle());
222                     LOG.info("Debug: ** id is: " + offer.getId());
223 
224                     try {
225                         getJobService().storeJobOffer(offer);
226                         LOG.info("Debug: ** id is: " + offer.getId());
227                     } catch (DataAccessException ex) {
228                         LOG.debug(ex);
229                     }
230 
231                     LOG.info("Debug: Added entry: " + offer.getDescription());
232                 }
233             }
234         }
235     }
236     
237     
238     /***
239      * merges and updates the lists.
240      * @param newList resulting list from the merge.
241      * @param oldList old list to update.
242      */
243     public void mergeAndUpdateLists(List newList, List oldList) {
244         Collections.sort(newList, byFeedURL);
245         Collections.sort(oldList, byFeedURL);
246         
247         for (Iterator it = newList.iterator(); it.hasNext();) {
248             JobOffer jobOffer = (JobOffer) it.next();
249             int search = Collections.binarySearch(oldList,jobOffer,byFeedURL);
250             LOG.info("Debug: Search returned: "+search+" for id "+jobOffer.getTitle()+ " for url "+jobOffer.getUrl());
251             if (search == 0) {
252                 oldList.remove(search);
253                 it.remove();
254 
255                 LOG.info("Debug: Removed duplicate from list: "+jobOffer.getId()+" - " +jobOffer.getTitle()+" - "+jobOffer.getUrl());
256             }
257 
258 
259         }
260         LOG.info("Debug: For removal: "+oldList);
261         LOG.info("Debug: For adding: "+newList);
262         //newList.addAll(oldList);
263 
264     }
265     
266     /***
267      * implementation of the comparator.
268      */
269     private Comparator byFeedURL = new Comparator() {
270 
271         /***
272          * Compare nmber of occurences of searched criteria.
273          *
274          * @param o1 row 1
275          * @param o2 row 2
276          * @return 0 if equal, ...
277          */
278         public int compare(Object o1, Object o2) {
279             JobOffer offer1 = (JobOffer) o1;
280             JobOffer offer2 = (JobOffer) o2;
281             int compare = offer1.getUrl().compareTo(offer2.getUrl());
282             LOG.info("Debug: Compare: "+ compare);
283             LOG.info("Debug: Url1: "+ offer1.getUrl()+ " Url2: "+offer2.getUrl());
284             return compare;
285 
286         }
287     };
288     
289     
290 
291     /***
292      * @return the comparator needed.
293      */
294     public Comparator getByFeedURL() {
295 		return byFeedURL;
296 	}
297 
298     /***
299      * @param byFeedURL the comparator.
300      */
301 	public void setByFeedURL(Comparator byFeedURL) {
302 		this.byFeedURL = byFeedURL;
303 	}
304 
305 	/***
306      * Checks wheter title or description contains matching keywords
307      * With a keyword like JAVA then: Java is true, Javascript is false
308      * Javaa/J2EE is true and Djava false
309      *
310      * @param offer    the offer to check
311      * @param keywords List
312      * @return true if valid on a keyword
313      */
314     boolean isValid(JobOffer offer, List keywords) {
315         String offerTitle = " " + offer.getTitle().toUpperCase() + " ";
316         String offerDesc = " " + offer.getDescription().toUpperCase() + " ";
317         boolean isValid = false;
318         for (int i = 0; i < keyWordList.size(); i++) {
319             String s = (String) keyWordList.get(i);
320 
321             int indTitle = offerTitle.indexOf(s);
322             int indDesc = offerDesc.indexOf(s);
323             // things get more complicated because with keyword Java:
324             // Java is true, Javascript is false but Java/foo should be true
325             if (indTitle != -1 || indDesc != -1) {
326                 if (indTitle != -1) {
327                     char c = offerTitle.charAt(indTitle + s.length());
328                     char d = offerTitle.charAt(indTitle - 1);
329                     if (!Character.isLetter(c) && !Character.isLetter(d)) {
330                         isValid = true;
331                     }
332 
333                 }
334                 if (indDesc != -1) {
335                     char c = offerDesc.charAt(indDesc + s.length());
336                     char d = offerDesc.charAt(indDesc - 1);
337                     if (!Character.isLetter(c) && !Character.isLetter(d)) {
338                         isValid = true;
339                     }
340 
341                 }
342             }
343         }
344         LOG.info("Debug:  IsValid: " + isValid + " - " + offer);
345         return isValid;
346 
347     }
348 
349 
350 }
351 
352 /***
353  * $Log: AggregatorFeederTask.java,v $
354  * Revision 1.6  2005/12/20 15:36:45  shally
355  * CheckStyle and PMD changes.
356  *
357  * Revision 1.5  2005/12/09 10:46:55  shally
358  * Opkuis voor checkstyle en PMD
359  *
360  * Revision 1.4  2005/09/30 14:38:08  bavo_jcs
361  * Fixed URL
362  *
363  * Revision 1.3  2005/09/19 16:15:19  schauwvliege
364  * Introduction of Approve items
365  *
366  * Revision 1.2  2005/09/13 08:11:17  schauwvliege
367  * organize imports
368  *
369  * Revision 1.1  2005/08/26 07:58:29  ge0ffrey
370  * split up the sources in service, serviceimpl and webclient
371  *
372  * Revision 1.24  2005/08/10 09:04:48  bavo_jcs
373  * Optimized imports according to checkstyle
374  *
375  * Revision 1.23  2005/08/09 12:59:54  bavo_jcs
376  * Optimized imports
377  *
378  * Revision 1.22  2005/08/08 12:08:02  bme_jcs
379  * resolved checkstyle errors
380  *
381  * Revision 1.21  2005/08/08 09:38:22  bme_jcs
382  * resolved checkstyle errors
383  *
384  * Revision 1.20  2005/08/05 14:21:07  bavo_jcs
385  * Feeder smart delete fix
386  *
387  * Revision 1.19  2005/08/05 09:50:10  bavo_jcs
388  * Deleted a system.out
389  *
390  * Revision 1.18  2005/08/05 08:27:55  bme_jcs
391  * resolved checkstyle errors
392  *
393  * Revision 1.17  2005/08/03 15:22:54  bavo_jcs
394  * Feeder Smart delete
395  *
396  * Revision 1.16  2005/08/02 15:36:35  bavo_jcs
397  * Feeder update improvement
398  *
399  * Revision 1.15  2005/07/20 15:07:27  bavo_jcs
400  * Feeder smart delete
401  *
402  * Revision 1.14  2005/07/07 14:55:13  bavo_jcs
403  * Ajax integration
404  *
405  * Revision 1.13  2005/07/05 14:46:01  schauwvliege
406  * Moved test data to AdminBootstrap
407  *
408  * Revision 1.12  2005/06/30 10:36:23  bavo_jcs
409  * change db init
410  *
411  * Revision 1.11  2005/06/30 10:33:02  bavo_jcs
412  * change db init
413  *
414  * Revision 1.10  2005/06/17 12:01:17  schauwvliege
415  * CheckStyle/ PMD
416  *
417  * Revision 1.9  2005/06/17 11:42:46  schauwvliege
418  * CheckStyle/ PMD
419  *
420  * Revision 1.8  2005/06/17 09:01:43  schauwvliege
421  * CheckStyle
422  *
423  * Revision 1.7  2005/06/14 13:40:05  schauwvliege
424  * Renamed add to store
425  *
426  * Revision 1.6  2005/06/14 12:05:52  schauwvliege
427  * CheckStyle and fixing tests
428  *
429  * Revision 1.5  2005/06/10 15:37:23  bavo_jcs
430  * int returned on delete
431  *
432  * Revision 1.4  2005/06/10 14:46:01  bavo_jcs
433  * web->webclient
434  *
435  * Revision 1.3  2005/06/10 13:27:20  bavo_jcs
436  * new version
437  *
438  * Revision 1.2  2005/06/09 08:18:43  bejug_cc
439  * Fix initial import
440  *
441  * Revision 1.18  2005/06/07 14:38:48  bbr
442  * Lucene highlightterms added
443  *
444  * Revision 1.17  2005/06/07 13:03:10  bbr
445  * Keyword filtering
446  *
447  * Revision 1.16  2005/06/06 15:47:24  bbr
448  * job source
449  *
450  * Revision 1.15  2005/06/05 12:25:04  sja
451  * Added sourceUrl.
452  *
453  * Revision 1.14  2005/06/03 09:44:09  bbr
454  * admin feed panel work
455  *
456  * Revision 1.13  2005/06/02 15:49:25  PSONG09
457  * modified company name of rss feed user
458  *
459  * Revision 1.12  2005/06/01 15:07:11  bbr
460  * RssFeed page
461  *
462  * Revision 1.11  2005/06/01 12:36:54  bbr
463  * RssFeedService
464  *
465  * Revision 1.10  2005/05/31 13:30:49  bbr
466  * reorganized contexts for tests
467  *
468  * Revision 1.9  2005/05/31 11:56:19  bbr
469  * deleted JobEntry
470  *
471  * Revision 1.8  2005/05/30 14:14:56  bbr
472  * servletcontext enabled
473  *
474  * Revision 1.7  2005/05/30 12:04:42  bbr
475  * using javacareersconfig
476  *
477  * Revision 1.6  2005/05/26 14:28:41  PSONG09
478  * integration with view
479  *
480  * Revision 1.5  2005/05/26 08:59:30  bbr
481  * split cron
482  * made tasks run at startup
483  *
484  * Revision 1.4  2005/05/25 15:25:04  bbr
485  * testdata
486  *
487  * Revision 1.3  2005/05/25 10:42:59  sja
488  * Removed default constructor and added javadoc to init method.
489  *
490  * Revision 1.2  2005/05/24 15:33:26  bbr
491  * Using spring sheduling
492  *
493  * Revision 1.1  2005/05/24 11:52:39  bbr
494  * Using spring sheduling
495  *
496  * Revision 1.1  2005/05/23 17:04:57  sja
497  * Moved to org.bejug.javacareers.feeder package.
498  *
499  * Revision 1.3  2005/05/23 15:33:12  bbr
500  * added weight to lucene
501  *
502  * Revision 1.2  2005/05/23 12:27:57  bbr
503  * no message
504  *
505  * Revision 1.1  2005/05/23 08:46:33  PSONG09
506  * added feeder source files to project
507  *
508  * Revision 1.8  2005/05/23 07:16:35  stephan_janssen
509  * Code cleanup.
510  *
511  * Revision 1.7  2005/05/22 16:57:02  stephan_janssen
512  * Replaced HibernateException with DataAccessException.
513  *
514  * Revision 1.6  2005/05/20 14:41:46  bavo_jcs
515  * minor changes
516  *
517  * Revision 1.5  2005/05/18 15:46:39  bavo_jcs
518  * -adeed lucene service
519  *
520  * Revision 1.4  2005/05/18 11:49:51  bavo_jcs
521  * no message
522  *
523  * Revision 1.3  2005/05/11 14:25:22  bavo_jcs
524  * - renamed main files
525  *
526  * Revision 1.2  2005/05/11 13:16:02  bavo_jcs
527  * - debugged thread
528  *
529  * Revision 1.1  2005/05/11 11:53:25  bavo_jcs
530  * refactored
531  * - conform to conventions
532  * - some javadoc
533  * - Added FeederTask design
534  *
535  */