You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@devlake.apache.org by "klesh (via GitHub)" <gi...@apache.org> on 2023/02/21 14:36:33 UTC

[GitHub] [incubator-devlake] klesh opened a new pull request, #4478: feat: finalizable collector helper / github pr support timeFilter/dif…

klesh opened a new pull request, #4478:
URL: https://github.com/apache/incubator-devlake/pull/4478

   ### Summary
   
   1. added NewStatefulApiCollectorForFinalizableEntity to help collecting data from APIs that do NOT support filtering data by `updated`
   2. github restful pr collector support timeFilter/diffSync
   
   
   ### Does this close any open issues?
   Part of #4403 
   
   
   ### Self test
   
   Collected `apache/incubator-devlake` PR with  the following command:
   ```
   go run ./plugins/github/github.go -c 1 -o apache -r incubator-devlake -a 2023-02-16T00:00:00Z -t collectApiPullRequests,extractApiPullRequests
   ```
   and limited records(only the first page) appeared in the database.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@devlake.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [incubator-devlake] mindlesscloud merged pull request #4478: feat: finalizable collector helper / github pr support timeFilter/dif…

Posted by "mindlesscloud (via GitHub)" <gi...@apache.org>.
mindlesscloud merged PR #4478:
URL: https://github.com/apache/incubator-devlake/pull/4478


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@devlake.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [incubator-devlake] mindlesscloud commented on a diff in pull request #4478: feat: finalizable collector helper / github pr support timeFilter/dif…

Posted by "mindlesscloud (via GitHub)" <gi...@apache.org>.
mindlesscloud commented on code in PR #4478:
URL: https://github.com/apache/incubator-devlake/pull/4478#discussion_r1113781971


##########
backend/helpers/pluginhelper/api/api_collector_with_state.go:
##########
@@ -18,18 +18,24 @@ limitations under the License.
 package api
 
 import (
+	"encoding/json"
+	"net/http"
+	"net/url"
 	"time"
 
 	"github.com/apache/incubator-devlake/core/dal"
 	"github.com/apache/incubator-devlake/core/errors"
 	"github.com/apache/incubator-devlake/core/models"
+	plugin "github.com/apache/incubator-devlake/core/plugin"

Review Comment:
   It seems the import alias is redundant



##########
backend/helpers/pluginhelper/api/api_collector_with_state.go:
##########
@@ -94,44 +100,202 @@ func (m *ApiCollectorStateManager) IsIncremental() bool {
 }
 
 // InitCollector init the embedded collector
-func (m *ApiCollectorStateManager) InitCollector(args ApiCollectorArgs) (err errors.Error) {
+func (m *ApiCollectorStateManager) InitCollector(args ApiCollectorArgs) errors.Error {
 	args.RawDataSubTaskArgs = m.RawDataSubTaskArgs
-	m.ApiCollector, err = NewApiCollector(args)
-	return err
+	apiCollector, err := NewApiCollector(args)
+	if err != nil {
+		return err
+	}
+	m.subtasks = append(m.subtasks, apiCollector)
+	return nil
 }
 
 // InitGraphQLCollector init the embedded collector
-func (m *ApiCollectorStateManager) InitGraphQLCollector(args GraphqlCollectorArgs) (err errors.Error) {
+func (m *ApiCollectorStateManager) InitGraphQLCollector(args GraphqlCollectorArgs) errors.Error {
 	args.RawDataSubTaskArgs = m.RawDataSubTaskArgs
-	m.GraphqlCollector, err = NewGraphqlCollector(args)
-	return err
-}
-
-// Execute the embedded collector and record execute state
-func (m ApiCollectorStateManager) Execute() errors.Error {
-	err := m.ApiCollector.Execute()
+	graphqlCollector, err := NewGraphqlCollector(args)
 	if err != nil {
 		return err
 	}
-
-	return m.updateState()
+	m.subtasks = append(m.subtasks, graphqlCollector)
+	return nil
 }
 
-// ExecuteGraphQL the embedded collector and record execute state
-func (m ApiCollectorStateManager) ExecuteGraphQL() errors.Error {
-	err := m.GraphqlCollector.Execute()
-	if err != nil {
-		return err
+// Execute the embedded collector and record execute state
+func (m *ApiCollectorStateManager) Execute() errors.Error {
+	for _, subtask := range m.subtasks {
+		err := subtask.Execute()
+		if err != nil {
+			return err
+		}
 	}
 
-	return m.updateState()
-}
-
-func (m ApiCollectorStateManager) updateState() errors.Error {
 	db := m.Ctx.GetDal()
 	m.LatestState.LatestSuccessStart = &m.ExecuteStart
 	// Deprecating(timeAfter): to be deleted
 	m.LatestState.CreatedDateAfter = m.CreatedDateAfter
 	m.LatestState.TimeAfter = m.TimeAfter
 	return db.CreateOrUpdate(&m.LatestState)
 }
+
+// NewStatefulApiCollectorForFinalizableEntity aims to add timeFilter/diffSync support for
+// APIs that do NOT support filtering data by updated date. However, it comes with the
+// following constraints:
+//  1. The entity is a short-lived object or it is likely to be irrelevant
+//     a. ci/id pipelines are short-lived objects
+//     b. pull request might took a year to be closed or never, but it is likely irrelevant
+//  2. The entity must be Finalizable: when it is finalized, no modification forever
+//  3. The API must fit one of the following traits:
+//     a. it supports filtering by Created Date, in this case, you may specify the `GetTotalPages`
+//     option to fetch data with Determined Strategy if possible.
+//     b. or sorting by Created Date in Descending order, this this case, you must use `Concurrency`

Review Comment:
   `this this case`->`in this case`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@devlake.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org