# Inspect the structure of the first 200 tasks: which filenames appear inside
# the task archives, which top-level JSON keys occur, and keep two full
# samples for display below.
filename_counter: Counter = Counter()
all_json_keys: Counter = Counter()
samples_for_show: List = []  # (path, parsed_task) pairs, at most 2
for i, row in enumerate(tqdm(ds_test, desc="inspecting structure", total=200)):
    if i >= 200:
        break
    p = parse_task(row["task_binary"])
    # Only archive-format tasks expose a files mapping.
    if p["format"] in ("tar", "zip"):
        for name, body in p["files"].items():
            filename_counter[name] += 1
            if name.endswith(".json") and isinstance(body, str):
                try:
                    obj = json.loads(body)
                    if isinstance(obj, dict):
                        for k in obj.keys():
                            all_json_keys[k] += 1
                except Exception:
                    # Best effort: malformed JSON files are simply skipped.
                    pass
        if len(samples_for_show) < 2:
            samples_for_show.append((row["path"], p))

print("\nMost common filenames inside task archives:")
for name, n in filename_counter.most_common(15):
    print(f"  {n:>4}  {name}")

print("\nMost common top-level JSON keys (across any *.json):")
for k, n in all_json_keys.most_common(20):
    print(f"  {n:>4}  {k}")

if samples_for_show:
    print(f"\nFull file listing for one sample task ({samples_for_show[0][0]}):")
    for name, body in samples_for_show[0][1]["files"].items():
        sz = len(body) if isinstance(body, (str, bytes)) else 0
        print(f"  {name}  ({sz:,} B)")
# Substrings that mark a filename as verifier-related.
VERIFIER_FILE_PATTERNS = ("verifier", "verify", "grader", "judge", "score", "eval")
# Top-level JSON/config keys that indicate a verifier is configured.
VERIFIER_JSON_KEYS = ("verifier", "verifier_config", "judge", "grader",
                      "rubric", "test_patch", "FAIL_TO_PASS", "tests")


def has_verifier(parsed: Dict[str, Any]) -> bool:
    """Detect verifiers via filename, JSON content, or both.

    Args:
        parsed: A parsed task dict with at least a ``"format"`` key; archive
            formats ("tar"/"zip") also carry a ``"files"`` mapping of
            name -> body, other formats may carry a ``"content"`` value.

    Returns:
        True if any filename matches a verifier pattern, or any JSON/YAML
        file body (or non-archive dict content) exposes a verifier key.
    """
    if parsed["format"] not in ("tar", "zip"):
        # Non-archive task: only a dict content payload can carry verifier keys.
        c = parsed.get("content")
        if isinstance(c, dict):
            return any(k in c for k in VERIFIER_JSON_KEYS)
        return False
    files = parsed["files"]
    # 1) Filename heuristic: any verifier-ish substring in any member name.
    for name in files:
        low = name.lower()
        if any(pat in low for pat in VERIFIER_FILE_PATTERNS):
            return True
    # 2) Content heuristic: look inside JSON/YAML members for verifier keys
    #    or telltale substrings.
    for name, body in files.items():
        if name.endswith((".json", ".yaml", ".yml")) and isinstance(body, str):
            try:
                obj = json.loads(body)
                if isinstance(obj, dict) and any(k in obj for k in VERIFIER_JSON_KEYS):
                    return True
            except Exception:
                # Not valid JSON (e.g. YAML) — fall through to substring scan.
                pass
            low = body.lower()
            if "verifier" in low or "test_patch" in low:
                return True
    return False
class TaskTroveExplorer:
    """High-level interface to the open-thoughts/TaskTrove dataset.

    Streams rows from the HF dataset, parses each task binary, and offers
    sampling, per-source summary statistics, and on-disk export.
    """

    def __init__(self, split: str = "test", dataset_id: str = DATASET_ID):
        self.dataset_id = dataset_id
        self.split = split
        # Streaming avoids downloading the full dataset up front.
        self._ds = load_dataset(dataset_id, split=split, streaming=True)

    def iter(self, limit: Optional[int] = None,
             source_filter: Optional[str] = None) -> Iterator[Dict[str, Any]]:
        """Yield raw dataset rows, optionally regex-filtered by source.

        Args:
            limit: Stop after yielding this many matching rows (None = all).
            source_filter: Regex matched (via ``search``) against the row's
                source, derived from its path.
        """
        rx = re.compile(source_filter) if source_filter else None
        n = 0
        for row in self._ds:
            if rx and not rx.search(source_of(row["path"])):
                continue
            yield row
            n += 1
            if limit is not None and n >= limit:
                return

    def sample(self, n: int = 5,
               source_filter: Optional[str] = None) -> List[Dict[str, Any]]:
        """Return up to *n* parsed tasks, annotated with path and source."""
        out = []
        for row in self.iter(limit=n, source_filter=source_filter):
            parsed = parse_task(row["task_binary"])
            parsed["path"] = row["path"]
            parsed["source"] = source_of(row["path"])
            out.append(parsed)
        return out

    def summary(self, limit: int = 1000,
                source_filter: Optional[str] = None) -> pd.DataFrame:
        """Aggregate per-source stats (sizes, file counts, verifier rate).

        Returns an empty DataFrame when no rows match.
        """
        rows = []
        for row in self.iter(limit=limit, source_filter=source_filter):
            parsed = parse_task(row["task_binary"])
            rows.append({
                "source": source_of(row["path"]),
                "compressed": parsed["compressed_size"],
                "raw": parsed["raw_size"],
                "format": parsed["format"],
                "n_files": len(parsed.get("files", {})),
                "has_verifier": has_verifier(parsed),
            })
        df = pd.DataFrame(rows)
        if df.empty:
            return df
        return (df.groupby("source")
                  .agg(n=("compressed", "count"),
                       mean_compressed_kb=("compressed", lambda s: s.mean() / 1024),
                       mean_raw_kb=("raw", lambda s: s.mean() / 1024),
                       mean_n_files=("n_files", "mean"),
                       verifier_rate=("has_verifier", "mean"))
                  .round(2)
                  .sort_values("n", ascending=False))

    @staticmethod
    def has_verifier(parsed: Dict[str, Any]) -> bool:
        """Convenience wrapper around the module-level ``has_verifier``."""
        return has_verifier(parsed)

    def export(self, output_dir: Union[str, Path], n: int = 10,
               source_filter: Optional[str] = None) -> Path:
        """Write up to *n* sampled tasks to disk, one directory per task.

        Archive tasks are expanded file-by-file; other tasks are written as
        task.json / task.txt / task.bin depending on content type.
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for parsed in self.sample(n=n, source_filter=source_filter):
            # Flatten the dataset path into a filesystem-safe directory name.
            slug = parsed["path"].replace("/", "_")
            tdir = output_dir / slug
            tdir.mkdir(exist_ok=True)
            if parsed["format"] in ("tar", "zip"):
                for name, body in parsed["files"].items():
                    out = tdir / name
                    out.parent.mkdir(parents=True, exist_ok=True)
                    if isinstance(body, str):
                        out.write_text(body, encoding="utf-8")
                    else:
                        out.write_bytes(body)
            else:
                content = parsed.get("content", b"")
                if isinstance(content, (dict, list)):
                    (tdir / "task.json").write_text(json.dumps(content, indent=2))
                elif isinstance(content, str):
                    (tdir / "task.txt").write_text(content)
                else:
                    (tdir / "task.bin").write_bytes(content)
        print(f"✓ exported tasks to {output_dir.resolve()}")
        return output_dir
# Smoke test: instantiate the explorer on the test split and show a few tasks.
explorer = TaskTroveExplorer(split="test")
print("\nSample of 3 parsed tasks:")
for s in explorer.sample(n=3):
    print(f"path: {s['path']} | source: {s['source']} | format: {s['format']} | "
          f"files: {len(s.get('files', {}))} | verifier: {has_verifier(s)}")

