@@ -491,4 +491,89 @@ def _join_condition(
491491 right : typed_expr .TypedExpr ,
492492 joins_nulls : bool ,
493493) -> typing .Union [sge .EQ , sge .And ]:
494- return sge .EQ (this = left .expr , expression = right .expr )
494+ """Generates a join condition to match pandas's null-handling logic.
495+
496+ Pandas treats null join keys as equal to each other, so null keys on one
497+ side match null keys on the other (a cross-join-like behavior among null
498+ keys). In contrast, a plain SQL equality never matches null keys, because NULL = NULL is not true, leading to an inner-join-like behavior.
499+
500+ This function generates the appropriate SQL condition to replicate the
501+ desired pandas behavior in BigQuery.
502+
503+ Args:
504+ left: The left-side join key.
505+ right: The right-side join key.
506+ joins_nulls: If True, generates complex logic to handle nulls/NaNs.
507+ Otherwise, uses a simple equality check where appropriate.
508+ """
509+ is_floating_types = (
510+ left .dtype == dtypes .FLOAT_DTYPE and right .dtype == dtypes .FLOAT_DTYPE
511+ )
512+ if not is_floating_types and not joins_nulls :
513+ return sge .EQ (this = left .expr , expression = right .expr )
514+
515+ is_numeric_types = dtypes .is_numeric (
516+ left .dtype , include_bool = False
517+ ) and dtypes .is_numeric (right .dtype , include_bool = False )
518+ if is_numeric_types :
519+ return _join_condition_for_numeric (left , right )
520+ else :
521+ return _join_condition_for_others (left , right )
522+
523+
def _join_condition_for_others(
    left: typed_expr.TypedExpr,
    right: typed_expr.TypedExpr,
) -> sge.And:
    """Builds a null-aware equality condition for non-numeric join keys.

    Both keys are cast to STRING and compared twice, once with NULL replaced
    by "0" and once with NULL replaced by "1". Both comparisons must hold.
    With a single placeholder, a NULL key would match a real key equal to
    that placeholder; requiring agreement under two different placeholders
    means a NULL key can only match another NULL key.

    Args:
        left: The left-side join key.
        right: The right-side join key.

    Returns:
        The conjunction of the two COALESCE-based equality checks.
    """
    lhs = _cast(left.expr, "STRING")
    rhs = _cast(right.expr, "STRING")

    def _null_as(operand, placeholder):
        # COALESCE(operand, <placeholder as a STRING literal>)
        return sge.func(
            "COALESCE", operand, _literal(placeholder, dtypes.STRING_DTYPE)
        )

    match_with_zero = sge.EQ(this=_null_as(lhs, "0"), expression=_null_as(rhs, "0"))
    match_with_one = sge.EQ(this=_null_as(lhs, "1"), expression=_null_as(rhs, "1"))
    return sge.And(this=match_with_zero, expression=match_with_one)
541+
542+
def _join_condition_for_numeric(
    left: typed_expr.TypedExpr,
    right: typed_expr.TypedExpr,
) -> sge.And:
    """Builds a null-aware equality condition for numeric join keys.

    Each key is compared twice, once with NULL replaced by 0 and once with
    NULL replaced by 1 (literals typed with that side's dtype). Both
    comparisons must hold, so a NULL key cannot be confused with a real 0
    or 1.

    When both keys are FLOAT, each comparison operand is additionally wrapped
    in an IF on IS_NAN that substitutes 2 (first comparison) or 3 (second
    comparison) for NaN, so NaN keys are compared through their own pair of
    placeholders rather than through NaN itself.

    Args:
        left: The left-side join key.
        right: The right-side join key.

    Returns:
        The conjunction of the two equality checks.
    """
    both_float = (
        left.dtype == dtypes.FLOAT_DTYPE and right.dtype == dtypes.FLOAT_DTYPE
    )

    def _comparable(side, null_placeholder, nan_placeholder):
        # NULL -> null_placeholder; for float/float joins, NaN -> nan_placeholder.
        filled = sge.func(
            "COALESCE", side.expr, _literal(null_placeholder, side.dtype)
        )
        if not both_float:
            return filled
        return sge.If(
            this=sge.IsNan(this=side.expr),
            true=_literal(nan_placeholder, side.dtype),
            false=filled,
        )

    first_check = sge.EQ(
        this=_comparable(left, 0, 2), expression=_comparable(right, 0, 2)
    )
    second_check = sge.EQ(
        this=_comparable(left, 1, 3), expression=_comparable(right, 1, 3)
    )
    return sge.And(this=first_check, expression=second_check)
0 commit comments