From c97d0079ee0337c428643c5192fdb216d22793e6 Mon Sep 17 00:00:00 2001 From: Sridhar <51srgu1mst@hft-stuttgart.de> Date: Fri, 7 Nov 2025 15:18:35 +0000 Subject: [PATCH 1/4] user config --- user-config.json | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 user-config.json diff --git a/user-config.json b/user-config.json new file mode 100644 index 0000000..f02c4a2 --- /dev/null +++ b/user-config.json @@ -0,0 +1,12 @@ +{ + "platform": { + "url": "YOUR_PLATFORM_URL_HERE", + "username": "YOUR_USERNAME_HERE", + "tenant": "YOUR_TENANT_HERE", + "password": "YOUR_PASSWORD_HERE" + }, + "plugin_id": "dc27d504-4e79-46da-a11c-61487d5cc751", + "source_paths": [ + "output" + ] +} \ No newline at end of file -- GitLab From 2f9b81383733b815d96cea3432c58c21e5d68dd8 Mon Sep 17 00:00:00 2001 From: Sridhar <51srgu1mst@hft-stuttgart.de> Date: Fri, 7 Nov 2025 15:30:54 +0000 Subject: [PATCH 2/4] Update data_fetch.py --- data_fetch.py | 167 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 138 insertions(+), 29 deletions(-) diff --git a/data_fetch.py b/data_fetch.py index 5eb02a2..cfe7241 100644 --- a/data_fetch.py +++ b/data_fetch.py @@ -1,61 +1,170 @@ -import requests +import os import json - +import uuid +import requests from document import Document from document_writer import DocumentWriter + def fetch_issues(owner, repo): - '''Return a list of issues from OWNER/REPO. - ''' + '''Return a list of issues from OWNER/REPO.''' # GitHub API endpoint url = f"https://api.github.com/repos/{owner}/{repo}/issues" headers = { "Accept": "application/vnd.github+json", # "Authorization": "Bearer YOUR_GITHUB_TOKEN" # optional + "User-Agent": "BigData4Biz-Ingestion-Plugin/1.0" } - # Fetch issues - response = requests.get(url, headers=headers) - if response.status_code == 200: + + # Fetch issues with error handling + try: + response = requests.get(url, headers=headers, timeout=30) + response.raise_for_status() # Raises an HTTPError for bad status codes return response.json() - else: - raise f"Failed to fetch issues from {owner}/{repo}: status {response.status_code}" + except requests.exceptions.RequestException as e: + raise Exception(f"Failed to fetch issues from {owner}/{repo}: {e}") + def serialize_issue(issue, owner, repo): - '''Serializes an issue object from the Github API to a Document object. - ''' - id = issue.get("id") + '''Serializes an issue object from the Github API to a Document object.''' + issue_id = issue.get("id") + issue_number = issue.get("number") doc = Document() - doc.set_name(f"{owner}/{repo}/issues/{id}") - doc.set_title(issue.get("title")) + # Use issue number instead of ID for better readability + doc.set_name(f"{owner}/{repo}/issues/{issue_number}.txt") + doc.set_title(issue.get("title", "Untitled Issue")) + + # Enhanced attributes with user information + user_info = issue.get("user", {}) doc.set_attributes({ - "id": issue.get("id"), - "number": issue.get("number"), + "id": issue_id, + "number": issue_number, "state": issue.get("state"), "creation_date": issue.get("created_at"), "updated_at": issue.get("updated_at"), - #"user_login": issue["user"]["login"] if "user" in issue and issue["user"] else None, + "user_login": user_info.get("login"), + "user_id": user_info.get("id"), "comments": issue.get("comments"), - "html_url": issue.get("html_url") + "html_url": issue.get("html_url"), + "labels": [label.get("name") for label in issue.get("labels", [])], + "assignees": [assignee.get("login") for assignee in issue.get("assignees", [])] }) - doc.set_main_content(issue.get("body") or "") + + # Build comprehensive content + content_parts = [] + + # Issue header + content_parts.append(f"Issue #{issue_number}: {issue.get('title', '')}") + content_parts.append("=" * 50) + + # Basic info + content_parts.append(f"State: {issue.get('state', '')}") + content_parts.append(f"Created: {issue.get('created_at', '')}") + content_parts.append(f"Updated: {issue.get('updated_at', '')}") + content_parts.append(f"Author: {user_info.get('login', 'Unknown')}") + content_parts.append(f"Comments: {issue.get('comments', 0)}") + + # Labels + labels = [label.get("name") for label in issue.get("labels", [])] + if labels: + content_parts.append(f"Labels: {', '.join(labels)}") + + # Assignees + assignees = [assignee.get("login") for assignee in issue.get("assignees", [])] + if assignees: + content_parts.append(f"Assignees: {', '.join(assignees)}") + + # Body content + content_parts.append("\nDescription:") + content_parts.append("=" * 30) + content_parts.append(issue.get("body") or "No description provided.") + + # URL reference + content_parts.append(f"\nGitHub URL: {issue.get('html_url', '')}") + + doc.set_main_content("\n".join(content_parts)) return doc + +def create_user_config(platform_config, source_path, plugin_id): + """ + Create user-config.json file for BigData4Biz ingestion protocol. + + Args: + platform_config: Dictionary with platform configuration + source_path: Path to the data source directory + plugin_id: Unique UUID for the plugin + """ + config = { + "platform": platform_config, + "plugin_id": plugin_id, + "source_paths": [source_path] + } + + with open("user-config.json", "w", encoding='utf-8') as f: + json.dump(config, f, indent=2, ensure_ascii=False) + + print(f"Configuration file created: user-config.json") + print(f"Plugin ID: {plugin_id}") + + def main(): # Repository details owner = "EbookFoundation" repo = "free-programming-books" - issues = fetch_issues(owner, repo) - documents = [] - for issue in issues: - documents.append(serialize_issue(issue, owner, repo)) + # Platform configuration - UPDATE THESE WITH YOUR ACTUAL CREDENTIALS + platform_config = { + "url": "YOUR_PLATFORM_URL_HERE", # e.g., "https://your-platform.example.com" + "username": "YOUR_USERNAME_HERE", + "tenant": "YOUR_TENANT_HERE", + "password": "YOUR_PASSWORD_HERE" + } + + print(f"Fetching issues from {owner}/{repo}...") + + try: + # Fetch issues from GitHub + issues = fetch_issues(owner, repo) + documents = [] + + for issue in issues: + documents.append(serialize_issue(issue, owner, repo)) + + print(f"Collected {len(documents)} issues") + + # Initialize document writer + output_dir = "output" + writer = DocumentWriter(output_dir) + + # Write all documents + print("Writing documents and metadata...") + for doc in documents: + writer.write_document(doc) + + # Create configuration file + create_user_config(platform_config, output_dir, writer.get_plugin_id()) + + print(f"\nSuccessfully processed {len(documents)} issues") + print(f" Output directory: {output_dir}") + print(f" Plugin ID: {writer.get_plugin_id()}") + print(f"Configuration: user-config.json") + + # Show sample of what was processed + if documents: + print(f"\nSample issues processed:") + for i, doc in enumerate(documents[:3]): # Show first 3 + print(f" {i + 1}. #{doc.get_attributes().get('number')}: {doc.get_title()}") + if len(documents) > 3: + print(f" ... and {len(documents) - 3} more") + + except Exception as e: + print(f"Error: {e}") + return 1 - print(f"Collected {len(documents)} documents") + return 0 - writer = DocumentWriter("output") - for doc in documents: - writer.write_document(doc) -if __name__=="__main__": - main() +if __name__ == "__main__": + exit(main()) -- GitLab From c61144d36692d9e2f9f8c0cb798481138fd69455 Mon Sep 17 00:00:00 2001 From: Sridhar <51srgu1mst@hft-stuttgart.de> Date: Fri, 7 Nov 2025 15:32:50 +0000 Subject: [PATCH 3/4] meta data uuid --- document_writer.py | 102 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 95 insertions(+), 7 deletions(-) diff --git a/document_writer.py b/document_writer.py index a3b1ac6..7dbdf1a 100644 --- a/document_writer.py +++ b/document_writer.py @@ -1,24 +1,112 @@ import os import json +import uuid +from datetime import datetime from document import Document + class DocumentWriter(): def __init__(self, target_directory) -> None: self.target = target_directory + self.plugin_id = str(uuid.uuid4()) # Generate unique plugin ID os.makedirs(self.target, exist_ok=True) os.makedirs(os.path.join(self.target, ".meta"), exist_ok=True) def write_document(self, document: Document) -> None: + """ + Write the document to a file and create corresponding metadata file + with all required fields for BigData4Biz ingestion protocol. + """ main_file_name = os.path.join(self.target, document.get_name()) os.makedirs(os.path.dirname(main_file_name), exist_ok=True) - with open(main_file_name, "w") as f: + + # Write main content with UTF-8 encoding + with open(main_file_name, "w", encoding='utf-8') as f: f.write(document.get_main_content()) meta_file_name = os.path.join(self.target, ".meta", document.get_name() + ".json") os.makedirs(os.path.dirname(meta_file_name), exist_ok=True) - with open(meta_file_name, "w") as f: - f.write(json.dumps({ - "title": document.get_title(), - "backlink": main_file_name, - "properties": document.get_attributes() - })) + + # Get document attributes + attributes = document.get_attributes() + + # Extract and convert dates to milliseconds + creation_date = self._iso_to_milliseconds(attributes.get("creation_date")) + updated_date = self._iso_to_milliseconds(attributes.get("updated_at")) + + # Use updated_date for last_modify_date, fallback to creation_date + last_modify_date = updated_date if updated_date else creation_date + + # Get external_link from attributes (GitHub HTML URL) + external_link = attributes.get("html_url", "") + + # Create metadata with all required fields + metadata = { + "title": document.get_title(), + "backlink": main_file_name, + "properties": attributes, + # Required fields for BigData4Biz protocol + "external_link": external_link, + "creation_date": creation_date, + "last_modify_date": last_modify_date + } + + # Write metadata file with UTF-8 encoding + with open(meta_file_name, "w", encoding='utf-8') as f: + json.dump(metadata, f, ensure_ascii=False, indent=2) + + def _iso_to_milliseconds(self, iso_date_string): + """ + Convert ISO date string to milliseconds since epoch. + + Args: + iso_date_string: ISO format date string from GitHub API + + Returns: + int: Milliseconds since epoch, or current time if parsing fails + """ + if not iso_date_string: + return int(datetime.now().timestamp() * 1000) + + try: + # Handle both 'Z' suffix and timezone offsets + if iso_date_string.endswith('Z'): + iso_date_string = iso_date_string[:-1] + '+00:00' + + # Parse ISO format string to datetime object + dt = datetime.fromisoformat(iso_date_string) + + # Convert to milliseconds since epoch + return int(dt.timestamp() * 1000) + + except (ValueError, TypeError) as e: + print(f"Warning: Could not parse date '{iso_date_string}': {e}") + return int(datetime.now().timestamp() * 1000) + + def get_plugin_id(self): + """ + Get the generated plugin ID for configuration. + + Returns: + str: Unique UUID for this plugin instance + """ + return self.plugin_id + + def write_documents(self, documents): + """ + Write multiple documents to files. + + Args: + documents: List of Document objects to write + """ + for doc in documents: + self.write_document(doc) + + def get_target_directory(self): + """ + Get the target directory path. + + Returns: + str: Path to the target directory + """ + return self.target -- GitLab