public class NutchJob extends Job
Job
for Nutch jobs.Job.JobState, Job.TaskStatusFilter
COMPLETION_POLL_INTERVAL_KEY, OUTPUT_FILTER, PROGRESS_MONITOR_POLL_INTERVAL_KEY, SUBMIT_REPLICATION, USED_GENERIC_PARSER
conf, credentials, ugi
APPLICATION_ATTEMPT_ID, APPLICATION_MASTER_CLASS, CACHE_ARCHIVES, CACHE_ARCHIVES_SIZES, CACHE_ARCHIVES_TIMESTAMPS, CACHE_ARCHIVES_VISIBILITIES, CACHE_FILE_TIMESTAMPS, CACHE_FILE_VISIBILITIES, CACHE_FILES, CACHE_FILES_SIZES, CACHE_LOCALARCHIVES, CACHE_LOCALFILES, CACHE_SYMLINK, CLASSPATH_ARCHIVES, CLASSPATH_FILES, COMBINE_CLASS_ATTR, COMBINE_RECORDS_BEFORE_PROGRESS, COMBINER_GROUP_COMPARATOR_CLASS, COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, COUNTER_GROUP_NAME_MAX_DEFAULT, COUNTER_GROUP_NAME_MAX_KEY, COUNTER_GROUPS_MAX_DEFAULT, COUNTER_GROUPS_MAX_KEY, COUNTER_NAME_MAX_DEFAULT, COUNTER_NAME_MAX_KEY, COUNTERS_MAX_DEFAULT, COUNTERS_MAX_KEY, DEFAULT_JOB_ACL_MODIFY_JOB, DEFAULT_JOB_ACL_VIEW_JOB, DEFAULT_JOB_AM_ACCESS_DISABLED, DEFAULT_JOB_TOKEN_TRACKING_IDS_ENABLED, DEFAULT_LOG_LEVEL, DEFAULT_MAP_CPU_VCORES, DEFAULT_MAP_MEMORY_MB, DEFAULT_MAPRED_ADMIN_JAVA_OPTS, DEFAULT_MAPRED_ADMIN_USER_ENV, DEFAULT_MAPREDUCE_APPLICATION_CLASSPATH, DEFAULT_MAPREDUCE_CROSS_PLATFORM_APPLICATION_CLASSPATH, DEFAULT_MAX_SHUFFLE_FETCH_RETRY_DELAY, DEFAULT_MR_AM_ADMIN_COMMAND_OPTS, DEFAULT_MR_AM_COMMAND_OPTS, DEFAULT_MR_AM_COMMIT_WINDOW_MS, DEFAULT_MR_AM_COMMITTER_CANCEL_TIMEOUT_MS, DEFAULT_MR_AM_CONTAINERLAUNCHER_THREAD_COUNT_LIMIT, DEFAULT_MR_AM_CPU_VCORES, DEFAULT_MR_AM_HISTORY_COMPLETE_EVENT_FLUSH_TIMEOUT_MS, DEFAULT_MR_AM_HISTORY_JOB_COMPLETE_UNFLUSHED_MULTIPLIER, DEFAULT_MR_AM_HISTORY_MAX_UNFLUSHED_COMPLETE_EVENTS, DEFAULT_MR_AM_HISTORY_USE_BATCHED_FLUSH_QUEUE_SIZE_THRESHOLD, DEFAULT_MR_AM_IGNORE_BLACKLISTING_BLACKLISTED_NODE_PERCENT, DEFAULT_MR_AM_JOB_CLIENT_THREAD_COUNT, DEFAULT_MR_AM_JOB_REDUCE_PREEMPTION_LIMIT, DEFAULT_MR_AM_JOB_REDUCE_RAMP_UP_LIMIT, DEFAULT_MR_AM_LOG_BACKUPS, DEFAULT_MR_AM_LOG_KB, DEFAULT_MR_AM_LOG_LEVEL, DEFAULT_MR_AM_MAX_ATTEMPTS, DEFAULT_MR_AM_NUM_PROGRESS_SPLITS, DEFAULT_MR_AM_STAGING_DIR, DEFAULT_MR_AM_TASK_ESTIMATOR_SMOOTH_LAMBDA_MS, DEFAULT_MR_AM_TASK_LISTENER_THREAD_COUNT, DEFAULT_MR_AM_TO_RM_HEARTBEAT_INTERVAL_MS, DEFAULT_MR_AM_TO_RM_WAIT_INTERVAL_MS, DEFAULT_MR_AM_VMEM_MB, DEFAULT_MR_CLIENT_MAX_RETRIES, DEFAULT_MR_CLIENT_TO_AM_IPC_MAX_RETRIES, DEFAULT_MR_CLIENT_TO_AM_IPC_MAX_RETRIES_ON_TIMEOUTS, DEFAULT_MR_JOB_END_NOTIFICATION_TIMEOUT, DEFAULT_MR_JOB_REDUCER_PREEMPT_DELAY_SEC, DEFAULT_REDUCE_CPU_VCORES, DEFAULT_REDUCE_MEMORY_MB, DEFAULT_SHELL, DEFAULT_SPLIT_METAINFO_MAXSIZE, DEFAULT_TASK_LOG_BACKUPS, GROUP_COMPARATOR_CLASS, HADOOP_WORK_DIR, ID, INDEX_CACHE_MEMORY_LIMIT, INPUT_FORMAT_CLASS_ATTR, IO_SORT_FACTOR, IO_SORT_MB, JAR, JAR_UNPACK_PATTERN, JOB_ACL_MODIFY_JOB, JOB_ACL_VIEW_JOB, JOB_AM_ACCESS_DISABLED, JOB_CANCEL_DELEGATION_TOKEN, JOB_CONF_FILE, JOB_JAR, JOB_JOBTRACKER_ID, JOB_LOCAL_DIR, JOB_NAME, JOB_NAMENODES, JOB_SPLIT, JOB_SPLIT_METAINFO, JOB_SUBMIT_DIR, JOB_SUBMITHOST, JOB_SUBMITHOSTADDR, JOB_TAGS, JOB_TOKEN_TRACKING_IDS, JOB_TOKEN_TRACKING_IDS_ENABLED, JOB_UBERTASK_ENABLE, JOB_UBERTASK_MAXBYTES, JOB_UBERTASK_MAXMAPS, JOB_UBERTASK_MAXREDUCES, JVM_NUMTASKS_TORUN, KEY_COMPARATOR, MAP_CLASS_ATTR, MAP_COMBINE_MIN_SPILLS, MAP_CPU_VCORES, MAP_DEBUG_SCRIPT, MAP_ENV, MAP_FAILURES_MAX_PERCENT, MAP_INPUT_FILE, MAP_INPUT_PATH, MAP_INPUT_START, MAP_JAVA_OPTS, MAP_LOG_LEVEL, MAP_MAX_ATTEMPTS, MAP_MEMORY_MB, MAP_OUTPUT_COLLECTOR_CLASS_ATTR, MAP_OUTPUT_COMPRESS, MAP_OUTPUT_COMPRESS_CODEC, MAP_OUTPUT_KEY_CLASS, MAP_OUTPUT_KEY_FIELD_SEPERATOR, MAP_OUTPUT_VALUE_CLASS, MAP_SKIP_INCR_PROC_COUNT, MAP_SKIP_MAX_RECORDS, MAP_SORT_SPILL_PERCENT, MAP_SPECULATIVE, MAPRED_ADMIN_USER_ENV, MAPRED_ADMIN_USER_SHELL, MAPRED_MAP_ADMIN_JAVA_OPTS, MAPRED_REDUCE_ADMIN_JAVA_OPTS, MAPREDUCE_APPLICATION_CLASSPATH, MAPREDUCE_APPLICATION_FRAMEWORK_PATH, MAPREDUCE_JOB_CLASSLOADER, MAPREDUCE_JOB_CLASSLOADER_SYSTEM_CLASSES, MAPREDUCE_JOB_CREDENTIALS_BINARY, MAPREDUCE_JOB_DIR, MAPREDUCE_JOB_SHUFFLE_PROVIDER_SERVICES, MAPREDUCE_JOB_USER_CLASSPATH_FIRST, MAPREDUCE_V2_CHILD_CLASS, MAX_SHUFFLE_FETCH_RETRY_DELAY, MAX_TASK_FAILURES_PER_TRACKER, MR_AM_ADMIN_COMMAND_OPTS, MR_AM_ADMIN_USER_ENV, MR_AM_COMMAND_OPTS, MR_AM_COMMIT_WINDOW_MS, MR_AM_COMMITTER_CANCEL_TIMEOUT_MS, MR_AM_CONTAINERLAUNCHER_THREAD_COUNT_LIMIT, MR_AM_CPU_VCORES, MR_AM_CREATE_JH_INTERMEDIATE_BASE_DIR, MR_AM_ENV, MR_AM_HISTORY_COMPLETE_EVENT_FLUSH_TIMEOUT_MS, MR_AM_HISTORY_JOB_COMPLETE_UNFLUSHED_MULTIPLIER, MR_AM_HISTORY_MAX_UNFLUSHED_COMPLETE_EVENTS, MR_AM_HISTORY_USE_BATCHED_FLUSH_QUEUE_SIZE_THRESHOLD, MR_AM_IGNORE_BLACKLISTING_BLACKLISTED_NODE_PERECENT, MR_AM_JOB_CLIENT_PORT_RANGE, MR_AM_JOB_CLIENT_THREAD_COUNT, MR_AM_JOB_NODE_BLACKLISTING_ENABLE, MR_AM_JOB_RECOVERY_ENABLE, MR_AM_JOB_RECOVERY_ENABLE_DEFAULT, MR_AM_JOB_REDUCE_PREEMPTION_LIMIT, MR_AM_JOB_REDUCE_RAMPUP_UP_LIMIT, MR_AM_JOB_SPECULATOR, MR_AM_LOG_BACKUPS, MR_AM_LOG_KB, MR_AM_LOG_LEVEL, MR_AM_MAX_ATTEMPTS, MR_AM_NUM_PROGRESS_SPLITS, MR_AM_PREFIX, MR_AM_SECURITY_SERVICE_AUTHORIZATION_CLIENT, MR_AM_SECURITY_SERVICE_AUTHORIZATION_TASK_UMBILICAL, MR_AM_STAGING_DIR, MR_AM_TASK_ESTIMATOR, MR_AM_TASK_ESTIMATOR_EXPONENTIAL_RATE_ENABLE, MR_AM_TASK_ESTIMATOR_SMOOTH_LAMBDA_MS, MR_AM_TASK_LISTENER_THREAD_COUNT, MR_AM_TO_RM_HEARTBEAT_INTERVAL_MS, MR_AM_TO_RM_WAIT_INTERVAL_MS, MR_AM_VMEM_MB, MR_APPLICATION_TYPE, MR_CLIENT_MAX_RETRIES, MR_CLIENT_TO_AM_IPC_MAX_RETRIES, MR_CLIENT_TO_AM_IPC_MAX_RETRIES_ON_TIMEOUTS, MR_JOB_END_NOTIFICATION_MAX_ATTEMPTS, MR_JOB_END_NOTIFICATION_MAX_RETRY_INTERVAL, MR_JOB_END_NOTIFICATION_PROXY, MR_JOB_END_NOTIFICATION_TIMEOUT, MR_JOB_END_NOTIFICATION_URL, MR_JOB_END_RETRY_ATTEMPTS, MR_JOB_END_RETRY_INTERVAL, MR_JOB_REDUCER_PREEMPT_DELAY_SEC, MR_PREFIX, NUM_MAP_PROFILES, NUM_MAPS, NUM_REDUCE_PROFILES, NUM_REDUCES, OUTPUT, OUTPUT_FORMAT_CLASS_ATTR, OUTPUT_KEY_CLASS, OUTPUT_VALUE_CLASS, PARTITIONER_CLASS_ATTR, PRESERVE_FAILED_TASK_FILES, PRESERVE_FILES_PATTERN, PRIORITY, QUEUE_NAME, RECORDS_BEFORE_PROGRESS, REDUCE_CLASS_ATTR, REDUCE_CPU_VCORES, REDUCE_DEBUG_SCRIPT, REDUCE_ENV, REDUCE_FAILURES_MAXPERCENT, REDUCE_INPUT_BUFFER_PERCENT, REDUCE_JAVA_OPTS, REDUCE_LOG_LEVEL, REDUCE_MARKRESET_BUFFER_PERCENT, REDUCE_MARKRESET_BUFFER_SIZE, REDUCE_MAX_ATTEMPTS, REDUCE_MEMORY_MB, REDUCE_MEMORY_TOTAL_BYTES, REDUCE_MEMTOMEM_ENABLED, REDUCE_MEMTOMEM_THRESHOLD, REDUCE_MERGE_INMEM_THRESHOLD, REDUCE_SKIP_INCR_PROC_COUNT, REDUCE_SKIP_MAXGROUPS, REDUCE_SPECULATIVE, SETUP_CLEANUP_NEEDED, SHUFFLE_CONNECT_TIMEOUT, SHUFFLE_FETCH_FAILURES, SHUFFLE_INPUT_BUFFER_PERCENT, SHUFFLE_MEMORY_LIMIT_PERCENT, SHUFFLE_MERGE_PERCENT, SHUFFLE_NOTIFY_READERROR, SHUFFLE_PARALLEL_COPIES, SHUFFLE_READ_TIMEOUT, SKIP_OUTDIR, SKIP_RECORDS, SKIP_START_ATTEMPTS, SPECULATIVE_SLOWNODE_THRESHOLD, SPECULATIVE_SLOWTASK_THRESHOLD, SPECULATIVECAP, SPLIT_FILE, SPLIT_METAINFO_MAXSIZE, STDERR_LOGFILE_ENV, STDOUT_LOGFILE_ENV, TASK_ATTEMPT_ID, TASK_CLEANUP_NEEDED, TASK_DEBUGOUT_LINES, TASK_ID, TASK_ISMAP, TASK_LOG_BACKUPS, TASK_MAP_PROFILE_PARAMS, TASK_OUTPUT_DIR, TASK_PARTITION, TASK_PROFILE, TASK_PROFILE_PARAMS, TASK_REDUCE_PROFILE_PARAMS, TASK_TEMP_DIR, TASK_TIMEOUT, TASK_TIMEOUT_CHECK_INTERVAL_MS, TASK_USERLOG_LIMIT, USER_LOG_RETAIN_HOURS, USER_NAME, WORKDIR, WORKFLOW_ADJACENCY_PREFIX_PATTERN, WORKFLOW_ADJACENCY_PREFIX_STRING, WORKFLOW_ID, WORKFLOW_NAME, WORKFLOW_NODE_NAME, WORKFLOW_TAGS, WORKING_DIR
Constructor and Description |
---|
NutchJob(Configuration conf)
Deprecated.
|
NutchJob(Configuration conf,
String jobName)
Deprecated.
|
Modifier and Type | Method and Description |
---|---|
static NutchJob |
getInstance(Configuration conf)
|
static NutchJob |
getInstance(Configuration conf,
String jobName)
|
static boolean |
shouldProcess(CharSequence mark,
org.apache.avro.util.Utf8 batchId) |
boolean |
waitForCompletion(boolean verbose) |
addArchiveToClassPath, addCacheArchive, addCacheFile, addFileToClassPath, cleanupProgress, createSymlink, failTask, getCluster, getCompletionPollInterval, getCounters, getFinishTime, getHistoryUrl, getInstance, getInstance, getInstance, getInstance, getInstance, getJobFile, getJobName, getJobState, getJobSubmitter, getPriority, getProgressPollInterval, getSchedulingInfo, getStartTime, getStatus, getTaskCompletionEvents, getTaskCompletionEvents, getTaskDiagnostics, getTaskOutputFilter, getTaskReports, getTrackingURL, isComplete, isRetired, isSuccessful, isUber, killJob, killTask, killTask, mapProgress, monitorAndPrintJob, reduceProgress, setCacheArchives, setCacheFiles, setCancelDelegationTokenUponJobCompletion, setCombinerClass, setCombinerKeyGroupingComparatorClass, setGroupingComparatorClass, setInputFormatClass, setJar, setJarByClass, setJobName, setJobSetupCleanupNeeded, setMapOutputKeyClass, setMapOutputValueClass, setMapperClass, setMapSpeculativeExecution, setMaxMapAttempts, setMaxReduceAttempts, setNumReduceTasks, setOutputFormatClass, setOutputKeyClass, setOutputValueClass, setPartitionerClass, setPriority, setProfileEnabled, setProfileParams, setProfileTaskRange, setReducerClass, setReduceSpeculativeExecution, setSortComparatorClass, setSpeculativeExecution, setTaskOutputFilter, setupProgress, setUser, setWorkingDirectory, submit, toString
getArchiveClassPaths, getArchiveTimestamps, getCacheArchives, getCacheFiles, getCombinerClass, getCombinerKeyGroupingComparator, getConfiguration, getCredentials, getFileClassPaths, getFileTimestamps, getGroupingComparator, getInputFormatClass, getJar, getJobID, getJobSetupCleanupNeeded, getLocalCacheArchives, getLocalCacheFiles, getMapOutputKeyClass, getMapOutputValueClass, getMapperClass, getMaxMapAttempts, getMaxReduceAttempts, getNumReduceTasks, getOutputFormatClass, getOutputKeyClass, getOutputValueClass, getPartitionerClass, getProfileEnabled, getProfileParams, getProfileTaskRange, getReducerClass, getSortComparator, getSymlink, getTaskCleanupNeeded, getUser, getWorkingDirectory, setJobID
clone, equals, finalize, getClass, hashCode, notify, notifyAll, wait, wait, wait
getArchiveClassPaths, getArchiveTimestamps, getCacheArchives, getCacheFiles, getCombinerClass, getCombinerKeyGroupingComparator, getConfiguration, getCredentials, getFileClassPaths, getFileTimestamps, getGroupingComparator, getInputFormatClass, getJar, getJobID, getJobSetupCleanupNeeded, getLocalCacheArchives, getLocalCacheFiles, getMapOutputKeyClass, getMapOutputValueClass, getMapperClass, getMaxMapAttempts, getMaxReduceAttempts, getNumReduceTasks, getOutputFormatClass, getOutputKeyClass, getOutputValueClass, getPartitionerClass, getProfileEnabled, getProfileParams, getProfileTaskRange, getReducerClass, getSortComparator, getSymlink, getTaskCleanupNeeded, getUser, getWorkingDirectory
@Deprecated public NutchJob(Configuration conf) throws IOException
getInstance(Configuration)
conf
- IOException
@Deprecated public NutchJob(Configuration conf, String jobName) throws IOException
getInstance(Configuration, String)
conf
- jobName
- IOException
public static NutchJob getInstance(Configuration conf) throws IOException
NutchJob
with no particular Cluster
and a
given Configuration
.
The NutchJob
makes a copy of the Configuration
so
that any necessary internal modifications do not reflect on the incoming
parameter.
A Cluster will be created from the conf parameter only when it's needed.
This code heavily mimics that of Hadoop's.conf
- the configurationNutchJob
, with no connection to a cluster yet.IOException
public static NutchJob getInstance(Configuration conf, String jobName) throws IOException
NutchJob
with no particular Cluster
and a given jobName.
A Cluster will be created from the conf parameter only when it's needed.
The NutchJob
makes a copy of the Configuration
so
that any necessary internal modifications do not reflect on the incoming
parameter.conf
- the configurationjobName
- the name given to this NutchJobNutchJob
, with no connection to a cluster yet.IOException
public boolean waitForCompletion(boolean verbose) throws IOException, InterruptedException, ClassNotFoundException
waitForCompletion
in class Job
IOException
InterruptedException
ClassNotFoundException
public static boolean shouldProcess(CharSequence mark, org.apache.avro.util.Utf8 batchId)
Copyright © 2015 The Apache Software Foundation