public class DeDuplicator
extends org.archive.modules.Processor
implements org.springframework.beans.factory.InitializingBean
Determines whether CrawlURIs are duplicates.
Duplicate detection can only be performed after the fetch processors have run.
Modifier and Type | Field and Description |
---|---|
static String |
ATTR_ANALYZE_TIMESTAMP |
static String |
ATTR_EQUIVALENT |
static String |
ATTR_FILTER_MODE |
static String |
ATTR_JUMP_TO |
static String |
ATTR_MIME_FILTER |
static String |
ATTR_ORIGIN |
static String |
ATTR_ORIGIN_HANDLING |
static String |
ATTR_STATS_PER_HOST |
static String |
ATTR_USE_SPARSE_RANGE_FILTER |
static String |
DEFAULT_MIME_FILTER |
static OriginHandling |
DEFAULT_ORIGIN_HANDLING |
protected boolean |
lookupByURL |
protected HashMap<String,is.landsbokasafn.deduplicator.Statistics> |
perHostStats |
protected org.apache.lucene.search.IndexSearcher |
searcher |
protected org.archive.modules.net.ServerCache |
serverCache |
protected is.landsbokasafn.deduplicator.Statistics |
stats |
protected boolean |
statsPerHost |
protected boolean |
useOrigin |
protected boolean |
useOriginFromIndex |
Constructor and Description |
---|
DeDuplicator() |
Modifier and Type | Method and Description |
---|---|
void |
afterPropertiesSet() |
protected void |
doAnalysis(org.archive.modules.CrawlURI curi,
is.landsbokasafn.deduplicator.Statistics currHostStats,
boolean isDuplicate) |
protected void |
doTimestampAnalysis(org.archive.modules.CrawlURI curi,
org.apache.lucene.document.Document urlHit,
is.landsbokasafn.deduplicator.Statistics currHostStats,
boolean isDuplicate) |
protected void |
finalTasks() |
boolean |
getAnalyzeTimestamp() |
boolean |
getBlacklist() |
String |
getIndexLocation() |
String |
getJumpTo() |
is.landsbokasafn.deduplicator.MatchingMethod |
getMatchingMethod() |
String |
getMimeFilter() |
String |
getOrigin() |
OriginHandling |
getOriginHandling() |
protected static String |
getPercentage(double portion,
double total) |
org.archive.modules.net.ServerCache |
getServerCache() |
boolean |
getStatsPerHost() |
boolean |
getTryEquivalent() |
boolean |
getUseSparseRengeFilter() |
protected void |
innerProcess(org.archive.modules.CrawlURI puri) |
protected org.archive.modules.ProcessResult |
innerProcessResult(org.archive.modules.CrawlURI curi) |
protected org.apache.lucene.document.Document |
lookupByDigest(org.archive.modules.CrawlURI curi,
is.landsbokasafn.deduplicator.Statistics currHostStats)
Process a CrawlURI looking up in the index by content digest
|
protected org.apache.lucene.document.Document |
lookupByURL(org.archive.modules.CrawlURI curi,
is.landsbokasafn.deduplicator.Statistics currHostStats)
Process a CrawlURI looking up in the index by URL
|
protected org.apache.lucene.search.Query |
queryField(String fieldName,
String value)
Run a simple Lucene query for a single term in a single field.
|
String |
report() |
void |
setAnalyzeTimestamp(boolean analyzeTimestamp) |
void |
setBlacklist(boolean blacklist) |
void |
setIndexLocation(String indexLocation) |
void |
setJumpTo(String jumpTo) |
void |
setMatchingMethod(is.landsbokasafn.deduplicator.MatchingMethod matchinMethod) |
void |
setMimeFilter(String mimeFilter) |
void |
setOrigin(String origin) |
void |
setOriginHandling(OriginHandling originHandling) |
void |
setServerCache(org.archive.modules.net.ServerCache serverCache) |
void |
setStatsPerHost(boolean statsPerHost) |
void |
setTryEquivalent(boolean tryEquivalent) |
void |
setUseSparseRengeFilter(boolean useSparseRengeFilter) |
protected boolean |
shouldProcess(org.archive.modules.CrawlURI curi) |
doCheckpoint, finishCheckpoint, flattenVia, fromCheckpointJson, getBeanName, getEnabled, getKeyedProperties, getRecordedSize, getShouldProcessRule, getURICount, hasHttpAuthenticationCredential, innerRejectProcess, isRunning, isSuccess, process, setBeanName, setEnabled, setRecoveryCheckpoint, setShouldProcessRule, start, startCheckpoint, stop, toCheckpointJson
public static final String ATTR_JUMP_TO
public static final String ATTR_ORIGIN
public static final String ATTR_EQUIVALENT
public static final String ATTR_MIME_FILTER
public static final String DEFAULT_MIME_FILTER
public static final String ATTR_FILTER_MODE
public static final String ATTR_ANALYZE_TIMESTAMP
public static final String ATTR_STATS_PER_HOST
public static final String ATTR_USE_SPARSE_RANGE_FILTER
public static final String ATTR_ORIGIN_HANDLING
public static final OriginHandling DEFAULT_ORIGIN_HANDLING
protected org.archive.modules.net.ServerCache serverCache
protected org.apache.lucene.search.IndexSearcher searcher
protected boolean lookupByURL
protected boolean statsPerHost
protected boolean useOrigin
protected boolean useOriginFromIndex
protected is.landsbokasafn.deduplicator.Statistics stats
public String getIndexLocation()
public void setIndexLocation(String indexLocation)
public is.landsbokasafn.deduplicator.MatchingMethod getMatchingMethod()
public void setMatchingMethod(is.landsbokasafn.deduplicator.MatchingMethod matchinMethod)
public String getJumpTo()
public void setJumpTo(String jumpTo)
public String getOrigin()
public void setOrigin(String origin)
public boolean getTryEquivalent()
public void setTryEquivalent(boolean tryEquivalent)
public String getMimeFilter()
public void setMimeFilter(String mimeFilter)
public boolean getBlacklist()
public void setBlacklist(boolean blacklist)
public boolean getAnalyzeTimestamp()
public void setAnalyzeTimestamp(boolean analyzeTimestamp)
public boolean getStatsPerHost()
public void setStatsPerHost(boolean statsPerHost)
public boolean getUseSparseRengeFilter()
public void setUseSparseRengeFilter(boolean useSparseRengeFilter)
public OriginHandling getOriginHandling()
public void setOriginHandling(OriginHandling originHandling)
public org.archive.modules.net.ServerCache getServerCache()
public void setServerCache(org.archive.modules.net.ServerCache serverCache)
public void afterPropertiesSet() throws Exception
afterPropertiesSet
in interface org.springframework.beans.factory.InitializingBean
Exception
protected boolean shouldProcess(org.archive.modules.CrawlURI curi)
shouldProcess
in class org.archive.modules.Processor
protected void innerProcess(org.archive.modules.CrawlURI puri)
innerProcess
in class org.archive.modules.Processor
protected org.archive.modules.ProcessResult innerProcessResult(org.archive.modules.CrawlURI curi) throws InterruptedException
innerProcessResult
in class org.archive.modules.Processor
InterruptedException
protected org.apache.lucene.document.Document lookupByURL(org.archive.modules.CrawlURI curi, is.landsbokasafn.deduplicator.Statistics currHostStats)
curi
- The CrawlURI to process
currHostStats
- A statistics object for the current host.
If per-host statistics tracking is enabled, this
must be non-null and the method will increment
the appropriate counters on it.
protected org.apache.lucene.document.Document lookupByDigest(org.archive.modules.CrawlURI curi, is.landsbokasafn.deduplicator.Statistics currHostStats)
curi
- The CrawlURI to process
currHostStats
- A statistics object for the current host.
If per-host statistics tracking is enabled, this
must be non-null and the method will increment
the appropriate counters on it.
public String report()
report
in class org.archive.modules.Processor
protected static String getPercentage(double portion, double total)
protected void doAnalysis(org.archive.modules.CrawlURI curi, is.landsbokasafn.deduplicator.Statistics currHostStats, boolean isDuplicate)
protected void doTimestampAnalysis(org.archive.modules.CrawlURI curi, org.apache.lucene.document.Document urlHit, is.landsbokasafn.deduplicator.Statistics currHostStats, boolean isDuplicate)
protected org.apache.lucene.search.Query queryField(String fieldName, String value)
fieldName
- The name of the field to look in.
value
- The value to query for.
protected void finalTasks()
Copyright © 2014 National and University Library of Iceland. All Rights Reserved.