diff --git a/CHANGELOG.md b/CHANGELOG.md index 75eebb2..669279b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +v2.3.0 +====== + + * Updated coverage file parsing to use coverage.py API instead of direct file parsing + * Added coverage>=5.0.0 as a core dependency + v2.2.1 ====== diff --git a/gitpandas/repository.py b/gitpandas/repository.py index 8e27da7..5f96a43 100644 --- a/gitpandas/repository.py +++ b/gitpandas/repository.py @@ -19,7 +19,7 @@ import numpy as np import pandas as pd -from git import GitCommandError, Repo +from git import BadName, BadObject, GitCommandError, Repo from pandas import DataFrame, to_datetime from gitpandas.cache import multicache @@ -207,24 +207,13 @@ def has_coverage(self): bool: True if a valid .coverage file exists, False otherwise """ - if os.path.exists(self.git_dir + os.sep + ".coverage"): - try: - with open(self.git_dir + os.sep + ".coverage") as f: - blob = f.read() - blob = blob.split("!")[2] - json.loads(blob) - return True - except Exception as e: - logger.warning(f"Could not parse .coverage file: {e}", exc_info=True) - return False - else: - return False + return os.path.exists(self.git_dir + os.sep + ".coverage") def coverage(self): """Analyzes test coverage information from the repository. - Attempts to parse the .coverage file if it exists and returns coverage - statistics for each file. + Attempts to read and parse the .coverage file in the repository root + using the coverage.py API. Returns coverage statistics for each file. Returns: pandas.DataFrame: A DataFrame with columns: @@ -236,41 +225,41 @@ def coverage(self): Additional columns for any labels specified in labels_to_add Note: - Returns an empty DataFrame with the correct columns if no coverage - file exists or it can't be parsed. + Returns an empty DataFrame if no coverage data exists or can't be read. 
""" - if not self.has_coverage(): return DataFrame(columns=["filename", "lines_covered", "total_lines", "coverage"]) - with open(self.git_dir + os.sep + ".coverage") as f: - blob = f.read() - blob = blob.split("!")[2] - cov = json.loads(blob) + try: + import coverage - ds = [] - for filename in cov["lines"]: - _idx = 0 - try: - with open(filename) as f: - for _idx, _ in enumerate(f): - pass - except FileNotFoundError: - logger.warning(f"Could not find file {filename} for coverage analysis.") + cov = coverage.Coverage(data_file=os.path.join(self.git_dir, ".coverage")) + cov.load() + data = cov.get_data() - num_lines = _idx + 1 + ds = [] + for filename in data.measured_files(): + try: + with open(os.path.join(self.git_dir, filename)) as f: + total_lines = sum(1 for _ in f) + lines_covered = len(data.lines(filename) or []) + short_filename = filename.replace(self.git_dir + os.sep, "") + ds.append([short_filename, lines_covered, total_lines]) + except OSError as e: + logger.warning(f"Could not process coverage for file {filename}: {e}") - try: - short_filename = filename.split(self.git_dir + os.sep)[1] - ds.append([short_filename, len(cov["lines"][filename]), num_lines]) - except IndexError: - logger.warning(f"Could not determine relative path for file {filename} during coverage analysis.") + if not ds: + return DataFrame(columns=["filename", "lines_covered", "total_lines", "coverage"]) - df = DataFrame(ds, columns=["filename", "lines_covered", "total_lines"]) - df["coverage"] = df["lines_covered"] / df["total_lines"] - df = self._add_labels_to_df(df) + df = DataFrame(ds, columns=["filename", "lines_covered", "total_lines"]) + df["coverage"] = df["lines_covered"] / df["total_lines"] + df = self._add_labels_to_df(df) - return df + return df + + except Exception as e: + logger.error(f"Failed to analyze coverage data: {e}", exc_info=True) + return DataFrame(columns=["filename", "lines_covered", "total_lines", "coverage"]) def hours_estimate( self, @@ -1497,6 +1486,220 @@ def __str__(self): """ return f"git repository: {self._repo_name()} at: {self.git_dir}" + def get_commit_content(self, rev, ignore_globs=None, include_globs=None): + """Gets detailed content changes for a specific commit. + + For each file changed in the commit, returns the actual content changes + including added and removed lines. + + Args: + rev (str): Revision (commit hash) to analyze + ignore_globs (Optional[List[str]]): List of glob patterns for files to ignore + include_globs (Optional[List[str]]): List of glob patterns for files to include + + Returns: + pandas.DataFrame: A DataFrame with columns: + - file (str): Path of the changed file + - change_type (str): Type of change (A=added, M=modified, D=deleted) + - old_line_num (int): Line number in the old version (None for added lines) + - new_line_num (int): Line number in the new version (None for deleted lines) + - content (str): The actual line content + - repository (str): Repository name + Additional columns for any labels specified in labels_to_add + + Note: + For binary files, only the change_type is recorded, with no line-by-line changes. + If both ignore_globs and include_globs are provided, files must match an include + pattern and not match any ignore patterns to be included. + """ + logger.info(f"Getting detailed content changes for revision '{rev}'") + + try: + commit = self.repo.commit(rev) + + # Get the parent commit. 
For merge commits, use first parent + parent = commit.parents[0] if commit.parents else None + parent_sha = parent.hexsha if parent else "4b825dc642cb6eb9a060e54bf8d69288fbee4904" # empty tree + + # Get the diff between this commit and its parent + diff = self.repo.git.diff( + parent_sha, + commit.hexsha, + "--unified=0", # No context lines + "--no-prefix", # Don't prefix with a/ and b/ + "--no-renames", # Don't try to detect renames + ) + + changes = [] + current_file = None + current_type = None + + for line in diff.split("\n"): + if line.startswith("diff --git"): + # New file being processed + file_path = line.split(" ")[-1] + + # Check if this file should be included based on globs + if not self.__check_extension({file_path: None}, ignore_globs, include_globs): + current_file = None + continue + + current_file = file_path + + elif line.startswith("new file"): + current_type = "A" + elif line.startswith("deleted"): + current_type = "D" + elif line.startswith("index"): + current_type = "M" + elif line.startswith("@@") and current_file: + # Parse the @@ line to get line numbers + # Format: @@ -old_start,old_count +new_start,new_count @@ + nums = line.split("@@")[1].strip().split(" ") + old_range = nums[0].split(",") + new_range = nums[1].split(",") + + old_start = int(old_range[0].lstrip("-")) + new_start = int(new_range[0].lstrip("+")) + + elif line.startswith("+") and current_file and not line.startswith("+++"): + # Added line + changes.append( + [ + current_file, + current_type, + None, # old line number + new_start, + line[1:], # Remove the + prefix + ] + ) + new_start += 1 + + elif line.startswith("-") and current_file and not line.startswith("---"): + # Removed line + changes.append( + [ + current_file, + current_type, + old_start, + None, # new line number + line[1:], # Remove the - prefix + ] + ) + old_start += 1 + + if not changes: + logger.info(f"No changes found in revision '{rev}' matching the filters") + return DataFrame(columns=["file", "change_type", "old_line_num", "new_line_num", "content"]) + + df = DataFrame(changes, columns=["file", "change_type", "old_line_num", "new_line_num", "content"]) + df = self._add_labels_to_df(df) + + logger.info(f"Found {len(df)} line changes in revision '{rev}'") + return df + + except (GitCommandError, IndexError, BadObject, BadName) as e: + logger.error(f"Failed to get content changes for revision '{rev}': {e}") + return DataFrame(columns=["file", "change_type", "old_line_num", "new_line_num", "content"]) + + def get_file_content(self, path, rev="HEAD"): + """Gets the content of a file from the repository at a specific revision. + + Safely retrieves file content by first verifying the file exists in git's + tree (respecting .gitignore) before attempting to read it. + + Args: + path (str): Path to the file relative to repository root + rev (str, optional): Revision to get file from. Defaults to 'HEAD'. + + Returns: + Optional[str]: Content of the file if it exists and is tracked by git, + None if file doesn't exist or isn't tracked. + + Note: + This only works for files that are tracked by git. Untracked files and + files matched by .gitignore patterns cannot be read. 
+ """ + logger.info(f"Getting content of file '{path}' at revision '{rev}'") + + try: + # First verify the file exists in git's tree + try: + # ls-tree -r for recursive, --full-name for full paths + # -l for long format (includes size) + self.repo.git.ls_tree("-r", "-l", "--full-name", rev, path) + except GitCommandError: + logger.warning(f"File '{path}' not found in git tree at revision '{rev}'") + return None + + # If we get here, the file exists in git's tree + # Use git show to get the file content + content = self.repo.git.show(f"{rev}:{path}") + return content + + except GitCommandError as e: + logger.error(f"Failed to get content of file '{path}' at revision '{rev}': {e}") + return None + + def list_files(self, rev="HEAD"): + """Lists all files in the repository at a specific revision, respecting .gitignore. + + Uses git ls-tree to get a list of all tracked files in the repository, + which automatically respects .gitignore rules since untracked and ignored + files are not in git's tree. + + Args: + rev (str, optional): Revision to list files from. Defaults to 'HEAD'. + + Returns: + pandas.DataFrame: A DataFrame with columns: + - file (str): Full path to the file relative to repository root + - mode (str): File mode (100644 for regular file, 100755 for executable, etc) + - type (str): Object type (blob for file, tree for directory) + - sha (str): SHA-1 hash of the file content + - repository (str): Repository name + Additional columns for any labels specified in labels_to_add + + Note: + This only includes files that are tracked by git. Untracked files and + files matched by .gitignore patterns are not included. + """ + logger.info(f"Listing files at revision '{rev}'") + + try: + # Get the full file list with details using ls-tree + # -r for recursive + # -l for long format (includes file size) + # --full-tree to start from root + # --full-name for full paths + output = self.repo.git.ls_tree("-r", "-l", "--full-tree", "--full-name", rev) + + if not output.strip(): + logger.info("No files found in repository") + return DataFrame(columns=["file", "mode", "type", "sha"]) + + # Parse the ls-tree output + # Format: \t + files = [] + for line in output.split("\n"): + if not line.strip(): + continue + + # Split on tab first to separate path from rest + details, path = line.split("\t") + mode, obj_type, sha, _ = details.split() + files.append([path, mode, obj_type, sha]) + + df = DataFrame(files, columns=["file", "mode", "type", "sha"]) + df = self._add_labels_to_df(df) + + logger.info(f"Found {len(df)} files at revision '{rev}'") + return df + + except GitCommandError as e: + logger.error(f"Failed to list files at revision '{rev}': {e}") + return DataFrame(columns=["file", "mode", "type", "sha"]) + def __repr__(self): """Returns a unique string representation of the repository. 
diff --git a/pyproject.toml b/pyproject.toml index 90bfbee..fd74ee5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ dependencies = [ "pandas>=2.0.0", "requests", "redis", + "coverage>=5.0.0", ] [project.urls] diff --git a/tests/test_Repository/test_file_operations.py b/tests/test_Repository/test_file_operations.py new file mode 100644 index 0000000..17398b7 --- /dev/null +++ b/tests/test_Repository/test_file_operations.py @@ -0,0 +1,161 @@ +import git +import pandas as pd +import pytest + +from gitpandas import Repository + + +@pytest.fixture +def local_repo(tmp_path): + """Create a local git repository with various file types and structures.""" + repo_path = tmp_path / "test_repo" + repo_path.mkdir() + repo = git.Repo.init(repo_path) + + # Configure git user + repo.config_writer().set_value("user", "name", "Test User").release() + repo.config_writer().set_value("user", "email", "test@example.com").release() + + # Create and checkout master branch + repo.git.checkout("-b", "master") + + # Create initial structure + (repo_path / "src").mkdir() + (repo_path / "docs").mkdir() + (repo_path / "tests").mkdir() + + # Create various files + files = { + "README.md": "# Test Repository\nA test repository for gitpandas.", + "src/main.py": "def main():\n print('Hello, World!')\n return True", + "src/utils.py": "def helper():\n return 'helper'", + "docs/index.md": "# Documentation\nThis is the documentation.", + "tests/test_main.py": "def test_main():\n assert True", + ".gitignore": "*.pyc\n__pycache__/\n.DS_Store", + } + + # Create and commit files + for path, content in files.items(): + file_path = repo_path / path + file_path.write_text(content) + repo.index.add([str(file_path)]) + + repo.index.commit("Initial commit") + + # Create some ignored files + (repo_path / "src/main.pyc").write_text("compiled python") + (repo_path / "src/__pycache__").mkdir() + (repo_path / "src/__pycache__/main.cpython-39.pyc").write_text("cached python") + + # Make a change to test commit content + main_py = repo_path / "src/main.py" + main_py.write_text("def main():\n print('Hello, Universe!')\n return True") + repo.index.add([str(main_py)]) + commit = repo.index.commit("Update greeting") + + return {"repo_path": repo_path, "repo": Repository(working_dir=str(repo_path)), "last_commit": commit.hexsha} + + +class TestFileOperations: + def test_list_files(self, local_repo): + """Test listing files in the repository.""" + repo = local_repo["repo"] + + # Get all files + files = repo.list_files() + + # Check basic DataFrame properties + assert isinstance(files, pd.DataFrame) + assert "file" in files.columns + assert "mode" in files.columns + assert "type" in files.columns + assert "sha" in files.columns + assert "repository" in files.columns + + # Check that we have the expected files + file_paths = set(files["file"].values) + expected_files = { + "README.md", + "src/main.py", + "src/utils.py", + "docs/index.md", + "tests/test_main.py", + ".gitignore", + } + assert file_paths == expected_files + + # Check that ignored files are not included + assert "src/main.pyc" not in file_paths + assert "src/__pycache__/main.cpython-39.pyc" not in file_paths + + # Check file types + assert all(files["type"] == "blob") # All should be files, not trees + + # Check file modes (should be regular files) + assert all(files["mode"].isin(["100644"])) + + def test_get_file_content(self, local_repo): + """Test getting file content from the repository.""" + repo = local_repo["repo"] + + # Test getting content of an existing file + 
content = repo.get_file_content("src/main.py") + assert content == "def main():\n print('Hello, Universe!')\n return True" + + # Test getting content at a specific revision (first commit) + first_content = repo.get_file_content("src/main.py", rev="HEAD^") + assert first_content == "def main():\n print('Hello, World!')\n return True" + + # Test getting content of a non-existent file + assert repo.get_file_content("nonexistent.txt") is None + + # Test getting content of an ignored file + assert repo.get_file_content("src/main.pyc") is None + + # Test getting content with invalid revision + assert repo.get_file_content("src/main.py", rev="invalid_rev") is None + + def test_get_commit_content(self, local_repo): + """Test getting detailed content changes from a commit.""" + repo = local_repo["repo"] + commit_sha = local_repo["last_commit"] + + # Get changes from the last commit + changes = repo.get_commit_content(commit_sha) + + # Check basic DataFrame properties + assert isinstance(changes, pd.DataFrame) + assert "file" in changes.columns + assert "change_type" in changes.columns + assert "old_line_num" in changes.columns + assert "new_line_num" in changes.columns + assert "content" in changes.columns + assert "repository" in changes.columns + + # Check that we have the expected changes + assert len(changes) > 0 + file_changes = changes[changes["file"] == "src/main.py"] + assert len(file_changes) > 0 + + # Check for removed line + removed = file_changes[file_changes["old_line_num"].notna()] + assert len(removed) == 1 + assert "Hello, World!" in removed.iloc[0]["content"] + + # Check for added line + added = file_changes[file_changes["new_line_num"].notna()] + assert len(added) == 1 + assert "Hello, Universe!" in added.iloc[0]["content"] + + # Test with glob filters + # Should find no changes when excluding .py files + filtered = repo.get_commit_content(commit_sha, ignore_globs=["*.py"]) + assert len(filtered) == 0 + + # Should find changes when including only .py files + filtered = repo.get_commit_content(commit_sha, include_globs=["*.py"]) + assert len(filtered) > 0 + + # Test with invalid commit + invalid = repo.get_commit_content("invalid_sha") + assert len(invalid) == 0
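For context on the parsing change recorded in the changelog: instead of splitting the raw `.coverage` file on `!` and JSON-decoding a fragment, the new `coverage()` goes through coverage.py's public API. A minimal sketch of that API, assuming a `.coverage` data file produced by coverage.py 5.0+ sits in `project_dir` (a placeholder path):

```python
import os

import coverage

project_dir = "./my_repo"  # placeholder: directory containing a .coverage file

# Same calls the new Repository.coverage() relies on: load the data file via
# the coverage.py API rather than parsing its on-disk format by hand.
cov = coverage.Coverage(data_file=os.path.join(project_dir, ".coverage"))
cov.load()
data = cov.get_data()

for filename in data.measured_files():
    executed = data.lines(filename) or []  # executed line numbers, or None if unknown
    print(f"{filename}: {len(executed)} lines covered")
```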