You are viewing a plain text version of this content. The canonical link for it is here.
Posted to server-dev@james.apache.org by ro...@apache.org on 2020/06/11 15:25:38 UTC

[james-project] 01/17: JAMES-3150 Add ScalaCheck for the garbage collector

This is an automated email from the ASF dual-hosted git repository.

rouazana pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/james-project.git

commit 0769feee3aa2c89ca70bf0ccfccf5bb03d1409be
Author: Matthieu Baechler <ma...@apache.org>
AuthorDate: Thu Feb 27 11:24:20 2020 +0100

    JAMES-3150 Add ScalaCheck for the garbage collector
---
 gc-properties.adoc                                 |  23 +++++
 server/blob/blob-deduplicating/pom.xml             | 106 ++++++++++++++++++++
 .../src/test/scala/GCPropertiesTest.scala          | 107 +++++++++++++++++++++
 server/blob/pom.xml                                |   1 +
 4 files changed, 237 insertions(+)

diff --git a/gc-properties.adoc b/gc-properties.adoc
new file mode 100644
index 0000000..7c69c01
--- /dev/null
+++ b/gc-properties.adoc
@@ -0,0 +1,23 @@
+= GC properties
+
+1. the execution time of the GC should be linked to
+the active dataset but not to the global dataset
+(for scalability purposes)
+
+2. GC should run on live dataset
+
+ 2.1. GC should not delete data being referenced by a pending process or
+still referenced
+
+ 2.2. GC should be idempotent: 2 concurrent or sequential runs should
+not have a different outcome than a single one
+
+3. GC should remove data from the underlying store
+
+ 3.1. an unreferenced piece of data should be removed after 1 day
+
+ 3.2. less than 10% of unreferenced data of a significant dataset
+should persist after three GC executions
+
+4. GC should report what it does
+
diff --git a/server/blob/blob-deduplicating/pom.xml b/server/blob/blob-deduplicating/pom.xml
new file mode 100644
index 0000000..e849535
--- /dev/null
+++ b/server/blob/blob-deduplicating/pom.xml
@@ -0,0 +1,106 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements. See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership. The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License. You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <artifactId>james-server-blob</artifactId>
+        <groupId>org.apache.james</groupId>
+        <version>3.5.0-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+
+    <artifactId>blob-deduplicating</artifactId>
+    <packaging>jar</packaging>
+
+    <name>Apache James :: Server :: Blob :: Deduplicating Blob Storage</name>
+    <description>
+        An implementation of BlobStore which deduplicates the stored blobs and uses a garbage collector
+        to ensure their effective deletion.
+    </description>
+
+    <dependencies>
+        <dependency>
+            <groupId>${james.groupId}</groupId>
+            <artifactId>blob-api</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>${james.groupId}</groupId>
+            <artifactId>blob-api</artifactId>
+            <type>test-jar</type>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>${james.groupId}</groupId>
+            <artifactId>blob-memory</artifactId>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>${james.groupId}</groupId>
+            <artifactId>james-server-testing</artifactId>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>${james.groupId}</groupId>
+            <artifactId>james-server-util</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>${james.groupId}</groupId>
+            <artifactId>testing-base</artifactId>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.scala-lang</groupId>
+            <artifactId>scala-library</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.scala-lang.modules</groupId>
+            <artifactId>scala-java8-compat_${scala.base}</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.scalactic</groupId>
+            <artifactId>scalactic_2.13</artifactId>
+            <version>3.1.1</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.scalatest</groupId>
+            <artifactId>scalatest_2.13</artifactId>
+            <version>3.1.1</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.scalacheck</groupId>
+            <artifactId>scalacheck_2.13</artifactId>
+            <version>1.14.3</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>net.alchim31.maven</groupId>
+                <artifactId>scala-maven-plugin</artifactId>
+            </plugin>
+        </plugins>
+    </build>
+
+</project>
diff --git a/server/blob/blob-deduplicating/src/test/scala/GCPropertiesTest.scala b/server/blob/blob-deduplicating/src/test/scala/GCPropertiesTest.scala
new file mode 100644
index 0000000..5de3f44
--- /dev/null
+++ b/server/blob/blob-deduplicating/src/test/scala/GCPropertiesTest.scala
@@ -0,0 +1,107 @@
+import org.apache.james.blob.api.{BlobId, TestBlobId}
+import org.scalacheck.Gen
+import org.scalatest.funsuite.AnyFunSuite
+
+/** Identifier of a generation; wraps the generation number. */
+final case class Generation(id: Long)
+
+/** Identifier of an iteration; wraps the iteration number. */
+final case class Iteration(id: Long)
+
+/** Identifier supplied by the party that references a blob; wraps an opaque string. */
+final case class ExternalID(id: String) // TODO
+
+/** An entry of the generated history: either a [[Reference]] or a [[Deletion]]. */
+sealed trait Event
+
+/**
+ * Records that `externalId` references `blobId`, tagged with the generation it belongs to.
+ *
+ * @param externalId identifier of the referencing party
+ * @param blobId     the referenced blob
+ * @param generation generation attached to this reference
+ */
+final case class Reference(externalId: ExternalID, blobId: BlobId, generation: Generation) extends Event
+
+/**
+ * Records the removal of a previously emitted [[Reference]].
+ *
+ * @param generation generation attached to the deletion itself
+ * @param reference  the reference being removed
+ */
+final case class Deletion(generation: Generation, reference: Reference) extends Event
+
+/**
+ * Pairs an iteration with a set of (generation, blob id) entries to delete.
+ *
+ * NOTE(review): presumably the outcome of one GC run; nothing in this file uses it yet.
+ *
+ * @param iteration     the iteration this report is attached to
+ * @param blobsToDelete blobs to delete, each paired with a generation
+ */
+final case class Report(iteration: Iteration, blobsToDelete: Set[(Generation, BlobId)])
+
+/**
+ * ScalaCheck generators producing random histories of [[Reference]] and
+ * [[Deletion]] events.
+ */
+object Generators {
+
+  /** Longs drawn from the inclusive range 0 to 100. */
+  val smallInteger: Gen[Long] = Gen.choose(0L, 100L)
+
+  /**
+   * Infinite, non-decreasing sequence of generations starting at `Generation(0)`.
+   *
+   * Each step adds 0, 1 or 2 to the previous id, with relative weights 90, 9 and 1.
+   */
+  val generationsGen: Gen[LazyList[Generation]] =
+    Gen.infiniteLazyList(Gen.frequency((90, Gen.const(0)), (9, Gen.const(1)), (1, Gen.const(2))))
+      .map(increments => increments.scanLeft(0)(_ + _))
+      .map(ids => ids.map(id => Generation(id.toLong)))
+
+  /** Iterations whose id is drawn from [[smallInteger]]. */
+  val iterationGen: Gen[Iteration] = smallInteger.map(Iteration.apply)
+
+  /** Factory used to build every generated blob id. */
+  val blobIdFactory: TestBlobId.Factory = new TestBlobId.Factory
+
+  /**
+   * Generates a blob id built from the string `<generation>_<uuid>`.
+   *
+   * The prefix is the `toString` of the [[Generation]] case class (e.g. `Generation(0)`),
+   * not the bare number.
+   *
+   * @param generation generation embedded in the blob id
+   */
+  def blobIdGen(generation: Generation): Gen[BlobId] =
+    Gen.uuid.map(uuid => blobIdFactory.from(s"${generation}_$uuid"))
+
+  /** External ids backed by a random UUID string. */
+  val externalIDGen: Gen[ExternalID] = Gen.uuid.map(uuid => ExternalID(uuid.toString))
+
+  /**
+   * Generates a reference with a fresh blob id and a fresh external id.
+   *
+   * @param generation generation attached to both the blob id and the reference
+   */
+  def referenceGen(generation: Generation): Gen[Reference] = for {
+    blobId <- blobIdGen(generation)
+    externalId <- externalIDGen
+  } yield Reference(externalId, blobId, generation)
+
+  /**
+   * Computes the references of a history that have not been deleted.
+   *
+   * The events must be ordered newest first, as [[eventsGen]] accumulates them:
+   * a deletion is then visited before the reference it removes, so that reference
+   * is skipped. A reference emitted again after its deletion is visited before the
+   * deletion and is therefore kept.
+   *
+   * @return a function from a newest-first history to its surviving references
+   */
+  def existingReferences: Seq[Event] => Set[Reference] = events => {
+    val (_, surviving) = events.foldLeft((Set.empty[Reference], Set.empty[Reference])) {
+      case ((deleted, alive), Deletion(_, reference)) =>
+        (deleted + reference, alive)
+      case ((deleted, alive), reference: Reference) =>
+        if (deleted.contains(reference)) (deleted, alive) else (deleted, alive + reference)
+    }
+    surviving
+  }
+
+  /**
+   * Generates the deletion of one of the surviving references, if any.
+   *
+   * @param previousEvents history so far, newest first
+   * @param generation     generation attached to the deletion
+   * @return `None` when no reference survives in `previousEvents`
+   */
+  def deletionGen(previousEvents: Seq[Event], generation: Generation): Gen[Option[Deletion]] = {
+    val persistingReferences = existingReferences(previousEvents)
+    if (persistingReferences.isEmpty) {
+      Gen.const(None)
+    } else {
+      Gen.oneOf(persistingReferences)
+        .map(reference => Some(Deletion(generation, reference)))
+    }
+  }
+
+  /**
+   * Generates a reference derived from an existing one.
+   *
+   * When `reference` belongs to `generation`, the result is the same reference
+   * (same blob id and generation) with a fresh external id. Otherwise a completely
+   * fresh reference is generated for `generation`.
+   *
+   * @param generation generation of the event being generated
+   * @param reference  previously emitted reference to derive from
+   */
+  def duplicateReferenceGen(generation: Generation, reference: Reference): Gen[Reference] = {
+    if (reference.generation == generation) {
+      externalIDGen.map(id => reference.copy(externalId = id))
+    } else {
+      referenceGen(generation)
+    }
+  }
+
+  /**
+   * Generates the next event of a history.
+   *
+   * The event is picked among: a fresh reference, a reference derived from a
+   * previous one (see [[duplicateReferenceGen]]) and, when a reference survives,
+   * a deletion. Candidates that cannot be built are left out.
+   *
+   * @param previousEvents history so far, newest first
+   * @param generation     generation attached to the generated event
+   */
+  def eventGen(previousEvents: Seq[Event], generation: Generation): Gen[Event] = {
+    val addEvents = previousEvents.collect { case reference: Reference => reference }
+
+    // Gen.oneOf cannot pick from an empty collection, so the duplicate candidate
+    // is only offered once at least one reference has been emitted.
+    val duplicateAddEventGen: Gen[Option[Reference]] =
+      if (addEvents.isEmpty) {
+        Gen.const(None)
+      } else {
+        Gen.oneOf(addEvents)
+          .flatMap(randomAddEvent => duplicateReferenceGen(generation, randomAddEvent))
+          .map(Some(_))
+      }
+
+    for {
+      greenAddEvent <- referenceGen(generation)
+      duplicateAddEvent <- duplicateAddEventGen
+      deleteEvent <- deletionGen(previousEvents, generation)
+      event <- Gen.oneOf(Seq[Event](greenAddEvent) ++ duplicateAddEvent ++ deleteEvent)
+    } yield event
+  }
+
+  /**
+   * Generates a whole history in chronological order.
+   *
+   * The history starts with a reference in `Generation(0)`, followed by between
+   * 0 and 100 events whose generations come from [[generationsGen]].
+   */
+  def eventsGen(): Gen[Seq[Event]] = for {
+    nbEvents <- Gen.choose(0, 100)
+    generations <- generationsGen.map(_.take(nbEvents))
+    startEvent <- referenceGen(Generation(0))
+    // Events are prepended while folding, hence the final reverse.
+    events <- foldM(generations, Seq[Event](startEvent)) { (previousEvents, generation) =>
+      eventGen(previousEvents, generation).map(_ +: previousEvents)
+    }
+  } yield events.reverse
+
+  /**
+   * Left fold whose step function returns a generator.
+   *
+   * Built on `Gen.tailRecM`, so the fold is expressed as a loop over generator
+   * steps rather than as nested `flatMap` calls.
+   *
+   * @param fa elements to fold over, in order
+   * @param z  initial accumulator
+   * @param f  combines the accumulator with the next element
+   * @return a generator of the final accumulator
+   */
+  def foldM[A, B](fa: LazyList[A], z: B)(f: (B, A) => Gen[B]): Gen[B] = {
+    // Left carries the remaining input and the accumulator; Right is the final result.
+    def step(in: (LazyList[A], B)): Gen[Either[(LazyList[A], B), B]] = {
+      val (remaining, accumulator) = in
+      if (remaining.isEmpty) {
+        Gen.const(Right(accumulator))
+      } else {
+        f(accumulator, remaining.head).map(next => Left((remaining.tail, next)))
+      }
+    }
+
+    Gen.tailRecM((fa, z))(step)
+  }
+}
+
+/** Smoke test printing one generated history, one event per line. */
+class GCPropertiesTest extends AnyFunSuite {
+  test("print sample") {
+    // `sample` yields an Option: nothing is printed when no value could be generated.
+    for {
+      events <- Generators.eventsGen().sample
+      event <- events
+    } println(event)
+  }
+}
diff --git a/server/blob/pom.xml b/server/blob/pom.xml
index 4351178..9744471 100644
--- a/server/blob/pom.xml
+++ b/server/blob/pom.xml
@@ -34,6 +34,7 @@
 
     <modules>
         <module>blob-api</module>
+        <module>blob-deduplicating</module>
         <module>blob-cassandra</module>
         <module>blob-common</module>
         <module>blob-export-api</module>


---------------------------------------------------------------------
To unsubscribe, e-mail: server-dev-unsubscribe@james.apache.org
For additional commands, e-mail: server-dev-help@james.apache.org