# Inspect the structure of the first 200 tasks: which filenames appear inside
# the task archives, which top-level JSON keys occur, and keep two full
# samples for display below.
filename_counter: Counter = Counter()
all_json_keys: Counter = Counter()
samples_for_show: List = []  # (path, parsed_task) pairs, at most 2
for i, row in enumerate(tqdm(ds_test, desc="inspecting structure", total=200)):
    if i >= 200:
        break
    p = parse_task(row["task_binary"])
    # Only archive-format tasks expose a files mapping.
    if p["format"] in ("tar", "zip"):
        for name, body in p["files"].items():
            filename_counter[name] += 1
            if name.endswith(".json") and isinstance(body, str):
                try:
                    obj = json.loads(body)
                    if isinstance(obj, dict):
                        for k in obj.keys():
                            all_json_keys[k] += 1
                except Exception:
                    # Best effort: malformed JSON files are simply skipped.
                    pass
        if len(samples_for_show) < 2:
            samples_for_show.append((row["path"], p))

print("\nMost common filenames inside task archives:")
for name, n in filename_counter.most_common(15):
    print(f"  {n:>4}  {name}")

print("\nMost common top-level JSON keys (across any *.json):")
for k, n in all_json_keys.most_common(20):
    print(f"  {n:>4}  {k}")

if samples_for_show:
    print(f"\nFull file listing for one sample task ({samples_for_show[0][0]}):")
    for name, body in samples_for_show[0][1]["files"].items():
        sz = len(body) if isinstance(body, (str, bytes)) else 0
        print(f"  {name}  ({sz:,} B)")
# Substrings that mark a filename as verifier-related.
VERIFIER_FILE_PATTERNS = ("verifier", "verify", "grader", "judge", "score", "eval")
# Top-level JSON/config keys that indicate a verifier is configured.
VERIFIER_JSON_KEYS = ("verifier", "verifier_config", "judge", "grader",
                      "rubric", "test_patch", "FAIL_TO_PASS", "tests")


def has_verifier(parsed: Dict[str, Any]) -> bool:
    """Detect verifiers via filename, JSON content, or both.

    Args:
        parsed: A parsed task dict with at least a ``"format"`` key; archive
            formats ("tar"/"zip") also carry a ``"files"`` mapping of
            name -> body, other formats may carry a ``"content"`` value.

    Returns:
        True if any filename matches a verifier pattern, or any JSON/YAML
        file body (or non-archive dict content) exposes a verifier key.
    """
    if parsed["format"] not in ("tar", "zip"):
        # Non-archive task: only a dict content payload can carry verifier keys.
        c = parsed.get("content")
        if isinstance(c, dict):
            return any(k in c for k in VERIFIER_JSON_KEYS)
        return False
    files = parsed["files"]
    # 1) Filename heuristic: any verifier-ish substring in any member name.
    for name in files:
        low = name.lower()
        if any(pat in low for pat in VERIFIER_FILE_PATTERNS):
            return True
    # 2) Content heuristic: look inside JSON/YAML members for verifier keys
    #    or telltale substrings.
    for name, body in files.items():
        if name.endswith((".json", ".yaml", ".yml")) and isinstance(body, str):
            try:
                obj = json.loads(body)
                if isinstance(obj, dict) and any(k in obj for k in VERIFIER_JSON_KEYS):
                    return True
            except Exception:
                # Not valid JSON (e.g. YAML) — fall through to substring scan.
                pass
            low = body.lower()
            if "verifier" in low or "test_patch" in low:
                return True
    return False
class TaskTroveExplorer:
    """High-level interface to the open-thoughts/TaskTrove dataset.

    Streams rows from the HF dataset, parses each task binary, and offers
    sampling, per-source summary statistics, and on-disk export.
    """

    def __init__(self, split: str = "test", dataset_id: str = DATASET_ID):
        self.dataset_id = dataset_id
        self.split = split
        # Streaming avoids downloading the full dataset up front.
        self._ds = load_dataset(dataset_id, split=split, streaming=True)

    def iter(self, limit: Optional[int] = None,
             source_filter: Optional[str] = None) -> Iterator[Dict[str, Any]]:
        """Yield raw dataset rows, optionally regex-filtered by source.

        Args:
            limit: Stop after yielding this many matching rows (None = all).
            source_filter: Regex matched (via ``search``) against the row's
                source, derived from its path.
        """
        rx = re.compile(source_filter) if source_filter else None
        n = 0
        for row in self._ds:
            if rx and not rx.search(source_of(row["path"])):
                continue
            yield row
            n += 1
            if limit is not None and n >= limit:
                return

    def sample(self, n: int = 5,
               source_filter: Optional[str] = None) -> List[Dict[str, Any]]:
        """Return up to *n* parsed tasks, annotated with path and source."""
        out = []
        for row in self.iter(limit=n, source_filter=source_filter):
            parsed = parse_task(row["task_binary"])
            parsed["path"] = row["path"]
            parsed["source"] = source_of(row["path"])
            out.append(parsed)
        return out

    def summary(self, limit: int = 1000,
                source_filter: Optional[str] = None) -> pd.DataFrame:
        """Aggregate per-source stats (sizes, file counts, verifier rate).

        Returns an empty DataFrame when no rows match.
        """
        rows = []
        for row in self.iter(limit=limit, source_filter=source_filter):
            parsed = parse_task(row["task_binary"])
            rows.append({
                "source": source_of(row["path"]),
                "compressed": parsed["compressed_size"],
                "raw": parsed["raw_size"],
                "format": parsed["format"],
                "n_files": len(parsed.get("files", {})),
                "has_verifier": has_verifier(parsed),
            })
        df = pd.DataFrame(rows)
        if df.empty:
            return df
        return (df.groupby("source")
                  .agg(n=("compressed", "count"),
                       mean_compressed_kb=("compressed", lambda s: s.mean() / 1024),
                       mean_raw_kb=("raw", lambda s: s.mean() / 1024),
                       mean_n_files=("n_files", "mean"),
                       verifier_rate=("has_verifier", "mean"))
                  .round(2)
                  .sort_values("n", ascending=False))

    @staticmethod
    def has_verifier(parsed: Dict[str, Any]) -> bool:
        """Convenience wrapper around the module-level ``has_verifier``."""
        return has_verifier(parsed)

    def export(self, output_dir: Union[str, Path], n: int = 10,
               source_filter: Optional[str] = None) -> Path:
        """Write up to *n* sampled tasks to disk, one directory per task.

        Archive tasks are expanded file-by-file; other tasks are written as
        task.json / task.txt / task.bin depending on content type.
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for parsed in self.sample(n=n, source_filter=source_filter):
            # Flatten the dataset path into a filesystem-safe directory name.
            slug = parsed["path"].replace("/", "_")
            tdir = output_dir / slug
            tdir.mkdir(exist_ok=True)
            if parsed["format"] in ("tar", "zip"):
                for name, body in parsed["files"].items():
                    out = tdir / name
                    out.parent.mkdir(parents=True, exist_ok=True)
                    if isinstance(body, str):
                        out.write_text(body, encoding="utf-8")
                    else:
                        out.write_bytes(body)
            else:
                content = parsed.get("content", b"")
                if isinstance(content, (dict, list)):
                    (tdir / "task.json").write_text(json.dumps(content, indent=2))
                elif isinstance(content, str):
                    (tdir / "task.txt").write_text(content)
                else:
                    (tdir / "task.bin").write_bytes(content)
        print(f"✓ exported tasks to {output_dir.resolve()}")
        return output_dir
# Smoke test: instantiate the explorer on the test split and show a few tasks.
explorer = TaskTroveExplorer(split="test")
print("\nSample of 3 parsed tasks:")
for s in explorer.sample(n=3):
    print(f"path: {s['path']} | source: {s['source']} | format: {s['format']} | "
          f"files: {len(s.get('files', {}))} | verifier: {has_verifier(s)}")

