package com.xebialabs.xlrelease.scheduler

import com.codahale.metrics.{InstrumentedExecutorService, MetricFilter}
import com.xebialabs.xlplatform.cluster.ClusterMode.Standalone
import com.xebialabs.xlrelease.actors.{ActorSystemHolder, ReleaseActorService}
import com.xebialabs.xlrelease.config.XlrConfig
import com.xebialabs.xlrelease.metrics.XlrMetricRegistry
import com.xebialabs.xlrelease.repository.JobRunnerRepository
import com.xebialabs.xlrelease.scheduler.service.JobService
import com.xebialabs.xlrelease.scheduler.workers.CompositeWorker
import com.xebialabs.xlrelease.user.User
import com.xebialabs.xlrelease.utils.PrefixedThreadFactory
import org.springframework.context.annotation.Lazy
import org.springframework.stereotype.Service

import java.util.concurrent.atomic.AtomicBoolean
import java.util.concurrent.{ExecutorService, Executors, TimeUnit}
import scala.collection.mutable

/**
 * Owns the local pool of [[WorkerThread]]s that consume jobs from the [[JobQueue]].
 *
 * The manager can be started and shut down repeatedly: `start()` first stops whatever is currently
 * running, empties the local queue, optionally recovers persisted jobs and then spins up a fresh
 * thread pool; `shutdown()` stops the workers and terminates the pool within a configured grace period.
 */
@Service
class WorkManager(val xlrConfig: XlrConfig,
                  val jobQueue: JobQueue,
                  val compositeWorker: CompositeWorker,
                  val jobService: JobService,
                  val jobRunnerRepository: JobRunnerRepository,
                  val releaseActorService: ReleaseActorService,
                  // actorSystemHolder depends on actorServiceHolder, which depends on ExecutionService, which depends on
                  // WorkManager, which depends on actorSystemHolder - this cycle has to be broken somewhere, so @Lazy is used here
                  @Lazy
                  val actorSystemHolder: ActorSystemHolder
                 ) extends JobRecoveryLogic {
  private lazy val maxThreadsCount: Int = xlrConfig.executors.scheduler.maxThreadsCount
  // stays null until the first call to start(); always go through Option(threadPool) before using it
  private var threadPool: ExecutorService = _
  private val shutdownInProgress: AtomicBoolean = new AtomicBoolean(false)
  private val workerThreads: mutable.Buffer[WorkerThread] = mutable.Buffer()
  private val TASK_EXECUTION = "taskExecution"

  /**
   * (Re)starts the work manager.
   *
   * Any currently registered workers are stopped first and the local job queue is emptied, so the
   * queue is not polluted by leftovers. Unless maintenance mode is enabled, persisted jobs are then
   * recovered, and finally the worker threads are started.
   */
  def start(): Unit = {
    logger.info(s"Starting work manager with ${maxThreadsCount} threads")
    stop()
    // make sure that local job queue is cleaned up before we start
    jobQueue.cancelIf(_ => true)
    jobQueue.start()
    if (xlrConfig.maintenanceModeEnabled) {
      logger.debug("[MAINTENANCE MODE] Job recovery - DISABLED")
    } else {
      // re-submit previously persisted local jobs so that queued jobs, abort scripts or scheduled jobs resume
      recoverJobs()
    }
    startThreads()
    logger.info("Started work manager")
  }

  /**
   * Recovers persisted jobs; invoked from `start()`.
   *
   * In standalone mode all jobs are recovered, otherwise only the jobs of this node plus the jobs
   * that have no node id. Non-fatal failures are logged and do not abort the startup; fatal errors
   * (as classified by `scala.util.control.NonFatal`) are propagated to the caller.
   */
  def recoverJobs(): Unit = {
    import scala.util.control.NonFatal

    try {
      logger.debug("Job recovery started")
      xlrConfig.cluster.mode match {
        case Standalone =>
          // special case if server was previously started in clustered mode we have to recover all jobs, including those from other nodes
          recoverAllJobs()
        case _ =>
          recoverJobs(nodeId)
          // jobs with nodeId == null are created by upgraders
          // reason why nodeId is null: at the creation time we don't know nodeId yet
          recoverJobs(null)
      }
      logger.debug("Job recovery completed, proceeding with startup")
    } catch {
      case NonFatal(t) =>
        logger.error("Job recovery failed", t)
    }
  }

  /** Replaces the thread pool with a fresh one and schedules `maxThreadsCount` new workers on it. */
  private def startThreads(): Unit = {
    shutdownInProgress.set(false)
    // Release the pool of a previous start(): without this every restart would leak its idle threads.
    // shutdown() does not interrupt anything, so workers that are still finishing are left alone.
    Option(threadPool).foreach(_.shutdown())
    threadPool = createThreadPool()
    workerThreads.clear()
    (0 until maxThreadsCount).foreach { _ =>
      val workerThread = new WorkerThread(jobQueue, compositeWorker)
      workerThreads += workerThread
      threadPool.execute(workerThread)
    }
  }

  /** Stops every registered worker, forgets them and stops the job queue. */
  private def stop(): Unit = {
    workerThreads.foreach(_.stop())
    workerThreads.clear()
    jobQueue.stop()
  }

  /**
   * Creates a fixed-size pool of `maxThreadsCount` threads; when metrics are enabled the pool is
   * wrapped into an `InstrumentedExecutorService`.
   */
  private def createThreadPool(): ExecutorService = {
    val executors: ExecutorService = Executors.newFixedThreadPool(maxThreadsCount, PrefixedThreadFactory("task-execution"))
    if (xlrConfig.metrics.enabled) {
      // InstrumentedExecutorService will register metrics on every instance creation and metricRegistry doesn't support that,
      // so metrics registered by a previous instance are removed first
      val registry = XlrMetricRegistry.metricRegistry
      registry.removeMatching(MetricFilter.startsWith(s"$TASK_EXECUTION."))
      new InstrumentedExecutorService(executors, registry, TASK_EXECUTION)
    } else {
      executors
    }
  }

  /**
   * Stops the workers and terminates the thread pool.
   *
   * Only one shutdown runs at a time; a concurrent call is skipped. Calling this before the manager
   * was ever started is safe. The in-progress flag is always released, even when the wait for
   * termination is interrupted.
   */
  def shutdown(): Unit = {
    if (shutdownInProgress.compareAndSet(false, true)) {
      try {
        logger.info("starting shutdown")
        stop()
        Option(threadPool) match {
          case Some(pool) => terminate(pool)
          case None => logger.debug("thread pool was never started, nothing to terminate")
        }
        // cleanup local job queue only once we gave enough time to worker threads to consume stop msgs
        jobQueue.cancelIf(_ => true)
      } finally {
        shutdownInProgress.set(false)
      }
    } else {
      logger.info("skipping shutdown, because shutdown already happened or in progress")
    }
  }

  /** Shuts the pool down gracefully and falls back to interrupting its tasks once the grace period expires. */
  private def terminate(pool: ExecutorService): Unit = {
    val gracePeriod = xlrConfig.taskSchedulerGraceShutdownPeriod
    pool.shutdown()
    if (!pool.awaitTermination(gracePeriod, TimeUnit.SECONDS)) {
      logger.warn("There are tasks, that did not complete within shutdown grace period, going to interrupt those tasks")
      pool.shutdownNow()
      if (!pool.awaitTermination(gracePeriod, TimeUnit.SECONDS)) {
        logger.error("There are tasks, that could not be interrupted, giving up")
      }
    }
  }

  /** Hands the job over to the local job queue. */
  def submit(job: Job): Unit = {
    jobQueue.submit(job)
  }

  /** Delegates to `jobQueue.replace` for the given job. */
  def replace(job: Job): Unit = {
    jobQueue.replace(job)
  }

  /**
   * Aborts the job that belongs to the given task: asks the runner to abort it (container tasks only),
   * deletes the persisted job and cancels matching task jobs in the local queue.
   *
   * @param taskId id of the task whose job should be aborted
   */
  def abortJobByTaskId(taskId: String): Unit = {
    logger.debug(s"Aborting scheduled job $taskId")
    // has to happen before the delete, because it reads the persisted job
    requestRunnerToAbortTask(taskId)
    jobService.deleteByTaskId(taskId)
    jobQueue.cancelIf {
      case job: TaskJob[_] => job.taskId == taskId
      case _ => false
    }
  }

  /**
   * For container task jobs only: asks the owning runner to abort a RESERVED or RUNNING job, and
   * fails the task when the job is RESERVED or still QUEUED.
   */
  private def requestRunnerToAbortTask(taskId: String): Unit = {
    jobService.findByTaskId(taskId).foreach { jobRow =>
      if (jobRow.jobType == JobType.CONTAINER_TASK) {
        val ownedByRunner = Seq(JobStatus.RESERVED, JobStatus.RUNNING).contains(jobRow.status) && jobRow.runnerId != null
        if (ownedByRunner) {
          jobRunnerRepository.findRunner(jobRow.runnerId) match {
            case Some(runner) => runner.abortJob(jobRow.id)
            case None => logger.error(s"Can't find runner [${jobRow.runnerId}]. Can't request runner to abort the job [${jobRow.id}]")
          }
          if (jobRow.status == JobStatus.RESERVED) {
            releaseActorService.failTaskAsync(taskId, "Task was aborted", User.SYSTEM, None)
          }
        } else if (jobRow.status == JobStatus.QUEUED) {
          releaseActorService.failTaskAsync(taskId, "Task was aborted", User.SYSTEM, None)
        }
      }
    }
  }

}
