One star knowing it all...
# HAVING DATA:
DATA = {
'a': 1.5,
'b': 1458266965.250572,
'c': [{'x': {'y': 'LT121000011101001000'}}, {'z': 'Omega'}]}
# GETTING KNOWLEDGE:
SCHEMA = {
'a': 'price#EUR|to.decimal',
'b': 'timestamp#date|to.unixtime',
'c': [{'*': 'contributions',
'x': {'*': 'origins', 'y': 'account#IBAN|to.string'},
'z': 'company#name|to.string'}],
}
# NORMALIZATION
metaform.load(DATA).format(SCHEMA)
# OR USE '*' TO REFERENCE SCHEMA TO PACK SCHEMA WITH DATA PACKETS:
metaform.load(dict(DATA, **{'*': 'https://github.com/wefindx/schema/wiki/Sale#test'})).format()
Metaform is a package for hierarchical and nested data normalization.
pip install metaform
import metaform
# INPUT
metaform.load({
'*': 'https://github.com/mindey/terms/wiki/person#foaf',
'url': 'http://dbpedia.org/resource/John_Lennon',
'fullname': 'John Lennon',
'birthdate': '1940-10-09',
'spouse': 'http://dbpedia.org/resource/Cynthia_Lennon'
}).format(refresh=True)
# (schemas are cached locally, pass refresh=True to redownload)
# OUTPUT
{
'*': 'GH:mindey/terms/person#foaf',
'jsonld:id': 'http://dbpedia.org/resource/John_Lennon',
'foaf:name': 'John Lennon',
'schema:birthDate': datetime.datetime(1940, 10, 9, 0, 0),
'schema:spouse': 'http://dbpedia.org/resource/Cynthia_Lennon'
}
data = metaform.load('https://www.metaculus.com/api2/questions/')
data['*'] ='https://github.com/wefindx/schema/wiki/Topic#metaculuscom-question'
data.format()
# Try it!
df = metaform.read_csv(
'https://gist.githubusercontent.com/mindey/3f2596e108a5c151f32e1967275a7689/raw/7c4c963219255008fdb438e8b9777cd658eea02e/hello-world.csv',
schema={
0: 'Timestamp|to.unixtime',
1: 'KeyUpOrDown|lambda x: x=="k↓" and "KeyDown" or (x=="k↑" and "KeyUp")',
2: 'KeyName'},
header=None
)
Alternatively, save the schema to a wiki (as shown here), and include the schema token inside the filename by encoding it as a sub-extension — that is, rename hello-world.csv
to hello-world.GH~mindey+schema+KeyEvent@mykeylogger-01.csv:
# To get schema token for filename (GH~mindey+schema+KeyEvent@mykeylogger-01) do:
metaform.metawiki.url2ext('https://github.com/mindey/schema/wiki/KeyEvent#mykeylogger-01')
# Then rename filename in the source, and just read file remotely or locally from disk:
df = metaform.read_csv('https://gist.githubusercontent.com/mindey/f33978b31468097b5003f032d5d85eb8/raw/9541191e4d99c052a7668223697ef0ef9ce37977/hello-world.GH~mindey+schema+KeyEvent@mykeylogger-01.csv', header=None)
metaform.load( DATA ).format( SCHEMA )
Let’s say we have some data:
data = {
'hello': 1.0,
'world': 2,
'how': ['is', {'are': {'you': 'doing'}}]
}
We can get a template for defining the schema by calling metaform.template:
metaform.template(data)
{'*': '',
'hello': {'*': ''},
'how': [{'*': '', 'are': {'you': {'*': ''}}}],
'world': {'*': ''}}
This provides an opportunity to specify metadata for each key and the object itself. For example:
schema = { # A # schema = {
'*': 'greeting', # L # '*': 'greeting',
'hello': {'*': 'length'}, # T # 'hello': 'length',
'world': {'*': 'atoms'}, # E # 'world': 'atoms',
'how': [ # R # 'how': [
{'*': 'method', # N # {'*': 'method',
'are': { # A # 'are': {
'*': 'yup', # T # '*': 'yup',
'you': {'*': 'me'}} # I # 'you': {'*': 'me'}}
} # V # }
]} # E # ]}
metaform.normalize(data, schema)
{'atoms': 2, 'length': 1.0, 'method': ['is', {'yup': {'me': 'doing'}}]}
We recommend saving the schemas you create for normalization — for data analytics and driver projects — in a dot-folder named .schema, as JSON or YAML files inside that folder.
So, we have access to all keys, and can specify, what to do with them:
schema = {
'*': 'greeting',
'hello': 'length|lambda x: x+5.',
'world': 'atoms|lambda x: str(x)+"ABC"',
'how': [
{'*': 'method',
'are': {
'*': 'yup',
'you': {'*': 'me|lambda x: "-".join(list(x))'}}
}
]}
metaform.normalize(data, schema)
{'atoms': '2ABC',
'length': 6.0,
'method': ['is', {'yup': {'me': 'd-o-i-n-g'}}]}
And suppose we want to define a more complex function that is inconvenient to express as a lambda:
from metaform import converters
def some_func(x):
a = 123
b = 345
return (b-a)*x
converters.func = some_func
schema = {
'*': 'greeting',
'hello': 'length|to.func',
'world': 'atoms|lambda x: str(x)+"ABC"',
'how': [
{'*': 'method',
'are': {
'*': 'yup',
'you': {'*': 'me|lambda x: "-".join(list(x))'}}
}
]}
metaform.normalize(data, schema)
{'atoms': '2ABC',
'length': 222.0,
'method': ['is', {'yup': {'me': 'd-o-i-n-g'}}]}
We just renamed the keys, and normalized values! What else could we want?
Suppose we have similar data from different sources. For example, topics and comments are not so different after all, because if a comment becomes large enough, it can stand as a topic of its own.
topics = requests.get('https://api.infty.xyz/topics/?format=json').json()['results']
comments = requests.get('https://api.infty.xyz/comments/?format=json').json()['results']
Let’s define templates for them so that the key names and types match:
topics_schema = [{
'id': 'topic-id',
'type': '|lambda x: {0: "NEED", 1: "GOAL", 2: "IDEA", 3: "PLAN", 4: "STEP", 5: "TASK"}.get(x)',
'owner': {'id': 'user-id'},
'blockchain': '|lambda x: x and True or False',
}]
normal_topics = metaform.normalize(topics, topics_schema)
topics_df = pandas.io.json.json_normalize(normal_topics)
topics_df.dtypes
blockchain bool
body object
categories object
categories_names object
children object
comment_count int64
created_date object
data object
declared float64
editors object
funds float64
is_draft bool
languages object
matched float64
owner.user-id int64
owner.username object
parents object
title object
topic-id int64
type object
updated_date object
url object
dtype: object
comments_schema = [{
'id': 'comment-id',
'topic': 'topic-url',
'text': 'body',
'owner': {'id': 'user-id'},
'blockchain': '|lambda x: x and True or False',
}]
normal_comments = metaform.normalize(comments, comments_schema)
comments_df = pandas.io.json.json_normalize(normal_comments)
comments_df.dtypes
assumed_hours object
blockchain bool
body object
claimed_hours object
comment-id int64
created_date object
donated float64
languages object
matched float64
owner.user-id int64
owner.username object
parent object
remains float64
topic-url object
updated_date object
url object
dtype: object
df = pandas.concat([topics_df, comments_df], sort=False)