I have a DataFrame (database_df) that serves as the master record: each row holds the IDs that the same team has in two different APIs. I collect match data from both APIs (api_1_df, api_2_df), and I need to use those IDs to find the rows where the same home team and the same away team appear together in a single row in both APIs.
When such rows are found, I need to build a DataFrame of the matches in common, with one column per API holding that API's match identifier (match_id):
import pandas as pd
api_1_id = "pressure_id"
api_2_id = "betfair_id"
api_1_match_id = "pressure_match_id"
api_2_match_id = "betfair_match_id"
database_df = pd.DataFrame({
'pressure_id': [101, 102, 103, 201, 202, 203],
'pressure_name': ["Rangers", "City", "Barcelona FC", "Real Madrid FC", "Liverpool", "Chelsea FC"],
'betfair_id': [1001, 1002, 1003, 2001, 2002, 2003],
'betfair_name': ["Rangers FC", "Manchester City", "Barcelona FC", "Real Madrid", "Liverpool FC", "Chelsea"]
})
api_1_df = pd.DataFrame({
'match_id': [1, 3],
'home_id': [101, 103],
'home_name': ["Rangers", "Barcelona FC"],
'away_id': [201, 203],
'away_name': ["Real Madrid FC", "Chelsea FC"]
})
api_2_df = pd.DataFrame({
'match_id': [123, 456, 789],
'home_id': [1001, 1002, 1003],
'home_name': ["Rangers", "City", "Barcelona FC"],
'away_id': [2001, 2002, 2003],
'away_name': ["Real Madrid", "Liverpool FC", "Chelsea"]
})
def api_1_and_api_2_ids(api_1_df: pd.DataFrame, database_df: pd.DataFrame, api_2_df: pd.DataFrame) -> None:
result = []
for _, row in api_1_df.iterrows():
home_id: pd.DataFrame = database_df.loc[database_df[api_1_id] == row['home_id']]
away_id: pd.DataFrame = database_df.loc[database_df[api_1_id] == row['away_id']]
home_id = home_id[api_2_id].iloc[0]
away_id = away_id[api_2_id].iloc[0]
matched_rows: pd.DataFrame = api_2_df[(api_2_df['home_id'] == home_id) & (api_2_df['away_id'] == away_id)]
if not matched_rows.empty:
result.append({api_1_match_id: row['match_id'], api_2_match_id: matched_rows.iloc[0]['match_id']})
result_df = pd.DataFrame(result, columns=[api_1_match_id, api_2_match_id])
result_df = result_df.sort_values(by=[api_1_match_id])
print(result_df)
Result:
pressure_match_id betfair_match_id
1 123
3 789
I would like to improve this approach because it is slow on large DataFrames. I would also welcome suggestions for making the code more readable, because I find it messy and hard to follow.